diff options
-rw-r--r-- | include/clblast.h | 222 | ||||
-rw-r--r-- | include/clblast_c.h | 436 | ||||
-rw-r--r-- | scripts/generator/generator.py | 68 | ||||
-rw-r--r-- | scripts/generator/routine.py | 100 | ||||
-rw-r--r-- | src/clblast.cc | 654 | ||||
-rw-r--r-- | src/clblast_c.cc | 1058 | ||||
-rw-r--r-- | test/wrapper_clblas.h | 1056 |
7 files changed, 3533 insertions, 61 deletions
diff --git a/include/clblast.h b/include/clblast.h index 953e6953..70a3b5bc 100644 --- a/include/clblast.h +++ b/include/clblast.h @@ -144,7 +144,7 @@ StatusCode Dotc(const size_t n, // BLAS level-2 (matrix-vector) routines // ================================================================================================= -// Generalized matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV +// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV template <typename T> StatusCode Gemv(const Layout layout, const Transpose a_transpose, const size_t m, const size_t n, @@ -155,6 +155,17 @@ StatusCode Gemv(const Layout layout, const Transpose a_transpose, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); +// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV +template <typename T> +StatusCode Gbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); + // Hermitian matrix-vector multiplication: CHEMV/ZHEMV template <typename T> StatusCode Hemv(const Layout layout, const Triangle triangle, @@ -166,6 +177,28 @@ StatusCode Hemv(const Layout layout, const Triangle triangle, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); +// Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV +template <typename T> +StatusCode Hbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); + +// Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV +template <typename T> +StatusCode Hpmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); + // Symmetric matrix-vector multiplication: SSYMV/DSYMV template <typename T> StatusCode Symv(const Layout layout, const Triangle triangle, @@ -177,11 +210,187 @@ StatusCode Symv(const Layout layout, const Triangle triangle, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); +// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV +template <typename T> +StatusCode Sbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); + +// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV +template <typename T> +StatusCode Spmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); + +// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV +template <typename T> +StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + +// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV +template <typename T> +StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + +// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV +template <typename T> +StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + +// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV +template <typename T> +StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + +// Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV +template <typename T> +StatusCode Tbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + +// Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV +template <typename T> +StatusCode Tpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + +// General rank-1 matrix update: SGER/DGER +template <typename T> +StatusCode Ger(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); + +// General rank-1 complex matrix update: CGERU/ZGERU +template <typename T> +StatusCode Geru(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); + +// General rank-1 complex conjugated matrix update: CGERC/ZGERC +template <typename T> +StatusCode Gerc(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); + +// Hermitian rank-1 matrix update: CHER/ZHER +template <typename T> +StatusCode Her(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); + +// Hermitian packed rank-1 matrix update: CHPR/ZHPR +template <typename T> +StatusCode Hpr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event); + +// Hermitian rank-2 matrix update: CHER2/ZHER2 +template <typename T> +StatusCode Her2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); + +// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2 +template <typename T> +StatusCode Hpr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event); + +// Symmetric rank-1 matrix update: SSYR/DSYR +template <typename T> +StatusCode Syr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); + +// Symmetric packed rank-1 matrix update: SSPR/DSPR +template <typename T> +StatusCode Spr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event); + +// Symmetric rank-2 matrix update: SSYR2/DSYR2 +template <typename T> +StatusCode Syr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); + +// Symmetric packed rank-2 matrix update: SSPR2/DSPR2 +template <typename T> +StatusCode Spr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event); + // ================================================================================================= // BLAS level-3 (matrix-matrix) routines // ================================================================================================= -// Generalized matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM +// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM template <typename T> StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, const size_t m, const size_t n, const size_t k, @@ -265,6 +474,15 @@ StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, c cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event); +// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM +template <typename T> +StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event); + // ================================================================================================= } // namespace clblast diff --git a/include/clblast_c.h b/include/clblast_c.h index 56507625..fac39a58 100644 --- a/include/clblast_c.h +++ b/include/clblast_c.h @@ -199,7 +199,7 @@ StatusCode CLBlastZdotc(const size_t n, // BLAS level-2 (matrix-vector) routines // ================================================================================================= -// Generalized matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV +// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV StatusCode CLBlastSgemv(const Layout layout, const Transpose a_transpose, const size_t m, const size_t n, const float alpha, @@ -233,6 +233,40 @@ StatusCode CLBlastZgemv(const Layout layout, const Transpose a_transpose, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); +// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV +StatusCode CLBlastSgbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDgbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastCgbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZgbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); + // Hermitian matrix-vector multiplication: CHEMV/ZHEMV StatusCode CLBlastChemv(const Layout layout, const Triangle triangle, const size_t n, @@ -251,6 +285,42 @@ StatusCode CLBlastZhemv(const Layout layout, const Triangle triangle, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); +// Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV +StatusCode CLBlastChbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZhbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); + +// Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV +StatusCode CLBlastChpmv(const Layout layout, const Triangle triangle, + const size_t n, + const cl_float2 alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZhpmv(const Layout layout, const Triangle triangle, + const size_t n, + const cl_double2 alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); + // Symmetric matrix-vector multiplication: SSYMV/DSYMV StatusCode CLBlastSsymv(const Layout layout, const Triangle triangle, const size_t n, @@ -269,11 +339,347 @@ StatusCode CLBlastDsymv(const Layout layout, const Triangle triangle, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); +// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV +StatusCode CLBlastSsbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDsbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); + +// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV +StatusCode CLBlastSspmv(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDspmv(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); + +// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV +StatusCode CLBlastStrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastCtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + +// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV +StatusCode CLBlastStbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastCtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + +// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV +StatusCode CLBlastStpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastCtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + +// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV +StatusCode CLBlastStrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastCtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + +// Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV +StatusCode CLBlastStbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastCtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + +// Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV +StatusCode CLBlastStpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastCtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + +// General rank-1 matrix update: SGER/DGER +StatusCode CLBlastSger(const Layout layout, + const size_t m, const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDger(const Layout layout, + const size_t m, const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); + +// General rank-1 complex matrix update: CGERU/ZGERU +StatusCode CLBlastCgeru(const Layout layout, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZgeru(const Layout layout, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); + +// General rank-1 complex conjugated matrix update: CGERC/ZGERC +StatusCode CLBlastCgerc(const Layout layout, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZgerc(const Layout layout, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); + +// Hermitian rank-1 matrix update: CHER/ZHER +StatusCode CLBlastCher(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZher(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); + +// Hermitian packed rank-1 matrix update: CHPR/ZHPR +StatusCode CLBlastChpr(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZhpr(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event); + +// Hermitian rank-2 matrix update: CHER2/ZHER2 +StatusCode CLBlastCher2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZher2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); + +// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2 +StatusCode CLBlastChpr2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZhpr2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event); + +// Symmetric rank-1 matrix update: SSYR/DSYR +StatusCode CLBlastSsyr(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDsyr(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); + +// Symmetric packed rank-1 matrix update: SSPR/DSPR +StatusCode CLBlastSspr(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDspr(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event); + +// Symmetric rank-2 matrix update: SSYR2/DSYR2 +StatusCode CLBlastSsyr2(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDsyr2(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); + +// Symmetric packed rank-2 matrix update: SSPR2/DSPR2 +StatusCode CLBlastSspr2(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDspr2(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event); + // ================================================================================================= // BLAS level-3 (matrix-matrix) routines // ================================================================================================= -// Generalized matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM +// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM StatusCode CLBlastSgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, const size_t m, const size_t n, const size_t k, const float alpha, @@ -483,6 +889,32 @@ StatusCode CLBlastZtrmm(const Layout layout, const Side side, const Triangle tri cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event); +// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM +StatusCode CLBlastStrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastCtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event); + // ================================================================================================= // CLBLAST_CLBLAST_C_H_ diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 699cd9cf..9c9675b8 100644 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -48,29 +48,55 @@ TU = DataType("typename T, typename U", "T,U", ["T", "U", "T", "U"], "T") # for # Populates a list of routines routines = [ -[ # Level 1 - Routine(True, 1, "swap", T, [S,D,C,Z], ["n"], [], [], ["x","y"], [], False, "Swap two vectors"), - Routine(True, 1, "scal", T, [S,D,C,Z], ["n"], [], [], ["x"], ["alpha"], False, "Vector scaling"), - Routine(True, 1, "copy", T, [S,D,C,Z], ["n"], [], ["x"], ["y"], [], False, "Vector copy"), - Routine(True, 1, "axpy", T, [S,D,C,Z], ["n"], [], ["x"], ["y"], ["alpha"], False, "Vector-times-constant plus vector"), - Routine(True, 1, "dot", T, [S,D], ["n"], [], ["x","y"], ["dot"], [], True, "Dot product of two vectors"), - Routine(True, 1, "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], True, "Dot product of two complex vectors"), - Routine(True, 1, "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], True, "Dot product of two complex vectors, one conjugated"), +[ # Level 1: vector-vector + #Routine(False, "1", "rotg", T, [S,D], [], [], [], [], ["a","b","c","s"], False, "Generate plane rotation"), + #Routine(False, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["c","s"], False, "Apply plane rotation"), + Routine(True, "1", "swap", T, [S,D,C,Z], ["n"], [], [], ["x","y"], [], False, "Swap two vectors"), + Routine(True, "1", "scal", T, [S,D,C,Z], ["n"], [], [], ["x"], ["alpha"], False, "Vector scaling"), + Routine(True, "1", "copy", T, [S,D,C,Z], ["n"], [], ["x"], ["y"], [], False, "Vector copy"), + Routine(True, "1", "axpy", T, [S,D,C,Z], ["n"], [], ["x"], ["y"], ["alpha"], False, "Vector-times-constant plus vector"), + Routine(True, "1", "dot", T, [S,D], ["n"], [], ["x","y"], ["dot"], [], True, "Dot product of two vectors"), + Routine(True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], True, "Dot product of two complex vectors"), + Routine(True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], True, "Dot product of two complex vectors, one conjugated"), ], -[ # Level 2 - Routine(True, 2, "gemv", T, [S,D,C,Z], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], False, "Generalized matrix-vector multiplication"), - Routine(True, 2, "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Hermitian matrix-vector multiplication"), - Routine(True, 2, "symv", T, [S,D], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Symmetric matrix-vector multiplication"), +[ # Level 2: matrix-vector + Routine(True, "2a", "gemv", T, [S,D,C,Z], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], False, "General matrix-vector multiplication"), + Routine(False, "2a", "gbmv", T, [S,D,C,Z], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], False, "General banded matrix-vector multiplication"), + Routine(True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Hermitian matrix-vector multiplication"), + Routine(False, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Hermitian banded matrix-vector multiplication"), + Routine(False, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], False, "Hermitian packed matrix-vector multiplication"), + Routine(True, "2a", "symv", T, [S,D], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Symmetric matrix-vector multiplication"), + Routine(False, "2a", "sbmv", T, [S,D], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Symmetric banded matrix-vector multiplication"), + Routine(False, "2a", "spmv", T, [S,D], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], False, "Symmetric packed matrix-vector multiplication"), + Routine(False, "2a", "trmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], True, "Triangular matrix-vector multiplication"), + Routine(False, "2a", "tbmv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], True, "Triangular banded matrix-vector multiplication"), + Routine(False, "2a", "tpmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], True, "Triangular packed matrix-vector multiplication"), + Routine(False, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], False, "Solves a triangular system of equations"), + Routine(False, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], False, "Solves a banded triangular system of equations"), + Routine(False, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], False, "Solves a packed triangular system of equations"), + # Level 2: matrix update + Routine(False, "2b", "ger", T, [S,D], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], False, "General rank-1 matrix update"), + Routine(False, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], False, "General rank-1 complex matrix update"), + Routine(False, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], False, "General rank-1 complex conjugated matrix update"), + Routine(False, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], False, "Hermitian rank-1 matrix update"), + Routine(False, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], False, "Hermitian packed rank-1 matrix update"), + Routine(False, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], False, "Hermitian rank-2 matrix update"), + Routine(False, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], False, "Hermitian packed rank-2 matrix update"), + Routine(False, "2b", "syr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], False, "Symmetric rank-1 matrix update"), + Routine(False, "2b", "spr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], False, "Symmetric packed rank-1 matrix update"), + Routine(False, "2b", "syr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], False, "Symmetric rank-2 matrix update"), + Routine(False, "2b", "spr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], False, "Symmetric packed rank-2 matrix update"), ], -[ # Level 3 - Routine(True, 3, "gemm", T, [S,D,C,Z], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], False, "Generalized matrix-matrix multiplication"), - Routine(True, 3, "symm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], False, "Symmetric matrix-matrix multiplication"), - Routine(True, 3, "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], False, "Hermitian matrix-matrix multiplication"), - Routine(True, 3, "syrk", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], False, "Rank-K update of a symmetric matrix"), - Routine(True, 3, "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], False, "Rank-K update of a hermitian matrix"), - Routine(True, 3, "syr2k", T, [S,D,C,Z], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], False, "Rank-2K update of a symmetric matrix"), - Routine(True, 3, "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], False, "Rank-2K update of a hermitian matrix"), - Routine(True, 3, "trmm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], False, "Triangular matrix-matrix multiplication"), +[ # Level 3: matrix-matrix + Routine(True, "3", "gemm", T, [S,D,C,Z], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], False, "General matrix-matrix multiplication"), + Routine(True, "3", "symm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], False, "Symmetric matrix-matrix multiplication"), + Routine(True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], False, "Hermitian matrix-matrix multiplication"), + Routine(True, "3", "syrk", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], False, "Rank-K update of a symmetric matrix"), + Routine(True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], False, "Rank-K update of a hermitian matrix"), + Routine(True, "3", "syr2k", T, [S,D,C,Z], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], False, "Rank-2K update of a symmetric matrix"), + Routine(True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], False, "Rank-2K update of a hermitian matrix"), + Routine(True, "3", "trmm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], False, "Triangular matrix-matrix multiplication"), + Routine(False, "3", "trsm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], False, "Solves a triangular system of equations"), ]] # ================================================================================================== diff --git a/scripts/generator/routine.py b/scripts/generator/routine.py index b2c50e3d..df4dd019 100644 --- a/scripts/generator/routine.py +++ b/scripts/generator/routine.py @@ -12,6 +12,9 @@ # # ================================================================================================== +# System modules +from itertools import chain + # Translates an option name to a CLBlast data-type def OptionToCLBlast(x): return { @@ -36,6 +39,9 @@ def OptionToWrapper(x): 'diagonal': "clblasDiag", }[x] +# Buffers without 'ld' or 'inc' parameter +NO_LD_INC = ["dot","ap"] + # ================================================================================================== # Class holding routine-specific information (e.g. name, which arguments, which precisions) @@ -71,6 +77,16 @@ class Routine(): def ShortNames(self): return "/".join([f.name+self.name.upper() for f in self.flavours]) + # Determines which buffers go first (between alpha and beta) and which ones go after + def BuffersFirst(self): + if self.level == "2b": + return ["x","y"] + return ["ap","a","b","x"] + def BuffersSecond(self): + if self.level == "2b": + return ["ap","a","b","c"] + return ["y","c"] + # ============================================================================================== # Retrieves a variable name for a specific input/output vector/matrix (e.g. 'x') @@ -78,7 +94,7 @@ class Routine(): if (name in self.inputs) or (name in self.outputs): a = [name+"_buffer"] b = [name+"_offset"] - c = [name+"_"+self.Postfix(name)] if (name not in ["dot"]) else [] + c = [name+"_"+self.Postfix(name)] if (name not in NO_LD_INC) else [] return [", ".join(a+b+c)] return [] @@ -88,7 +104,7 @@ class Routine(): if (name in self.inputs) or (name in self.outputs): a = [prefix+"cl_mem "+name+"_buffer"] b = ["const size_t "+name+"_offset"] - c = ["const size_t "+name+"_"+self.Postfix(name)] if (name not in ["dot"]) else [] + c = ["const size_t "+name+"_"+self.Postfix(name)] if (name not in NO_LD_INC) else [] return [", ".join(a+b+c)] return [] @@ -97,7 +113,7 @@ class Routine(): if (name in self.inputs) or (name in self.outputs): a = ["Buffer<"+self.template.buffertype+">("+name+"_buffer)"] b = [name+"_offset"] - c = [name+"_"+self.Postfix(name)] if (name not in ["dot"]) else [] + c = [name+"_"+self.Postfix(name)] if (name not in NO_LD_INC) else [] return [", ".join(a+b+c)] return [] @@ -120,7 +136,7 @@ class Routine(): if (name in self.inputs) or (name in self.outputs): a = [prefix+"cl_mem"] b = ["const size_t"] - c = ["const size_t"] if (name not in ["dot"]) else [] + c = ["const size_t"] if (name not in NO_LD_INC) else [] return [", ".join(a+b+c)] return [] @@ -134,41 +150,45 @@ class Routine(): # Retrieves the use of a scalar (alpha/beta) def ScalarUse(self, name, flavour): - if ((name == "alpha") and (name in self.scalars)): - return [flavour.UseAlpha()] - elif ((name == "beta") and (name in self.scalars)): - return [flavour.UseBeta()] + if name in self.scalars: + if name == "alpha": + return [flavour.UseAlpha()] + elif name == "beta": + return [flavour.UseBeta()] + return [name] return [] # Retrieves the use of a scalar (alpha/beta) def ScalarUseWrapper(self, name, flavour): - if ((name == "alpha") and (name in self.scalars)): - return [flavour.UseAlphaCL()] - elif ((name == "beta") and (name in self.scalars)): - return [flavour.UseBetaCL()] + if name in self.scalars: + if name == "alpha": + return [flavour.UseAlphaCL()] + elif name == "beta": + return [flavour.UseBetaCL()] + return [name] return [] # Retrieves the definition of a scalar (alpha/beta) def ScalarDef(self, name, flavour): - if ((name == "alpha") and (name in self.scalars)): - return ["const "+flavour.alpha_cl+" "+name] - elif ((name == "beta") and (name in self.scalars)): + if name in self.scalars: + if name == "alpha": + return ["const "+flavour.alpha_cl+" "+name] return ["const "+flavour.beta_cl+" "+name] return [] # As above, but without 'cl_' prefix def ScalarDefPlain(self, name, flavour): - if ((name == "alpha") and (name in self.scalars)): - return ["const "+flavour.alpha_cpp+" "+name] - elif ((name == "beta") and (name in self.scalars)): + if name in self.scalars: + if name == "alpha": + return ["const "+flavour.alpha_cpp+" "+name] return ["const "+flavour.beta_cpp+" "+name] return [] # Retrieves the type of a scalar (alpha/beta) def ScalarType(self, name, flavour): - if ((name == "alpha") and (name in self.scalars)): - return ["const "+flavour.alpha_cpp] - elif ((name == "beta") and (name in self.scalars)): + if name in self.scalars: + if name == "alpha": + return ["const "+flavour.alpha_cpp] return ["const "+flavour.beta_cpp] return [] @@ -234,43 +254,55 @@ class Routine(): def ArgumentsCladuc(self, flavour, indent): return (self.Options() + self.Sizes() + self.BufferCladuc("dot") + self.Scalar("alpha") + - self.BufferCladuc("a") + self.BufferCladuc("b") + self.BufferCladuc("x") + - self.Scalar("beta") + self.BufferCladuc("y") + self.BufferCladuc("c")) + list(chain(*[self.BufferCladuc(b) for b in self.BuffersFirst()])) + + self.Scalar("beta") + + list(chain(*[self.BufferCladuc(b) for b in self.BuffersSecond()])) + + list(chain(*[self.Scalar(s) for s in ["d1","d2","a","b","c","s"]]))) # Retrieves a combination of all the argument names, with CLBlast casts def ArgumentsCast(self, flavour, indent): return (self.OptionsCast(indent) + self.Sizes() + self.Buffer("dot") + self.ScalarUse("alpha", flavour) + - self.Buffer("a") + self.Buffer("b") + self.Buffer("x") + - self.ScalarUse("beta", flavour) + self.Buffer("y") + self.Buffer("c")) + list(chain(*[self.Buffer(b) for b in self.BuffersFirst()])) + + self.ScalarUse("beta", flavour) + + list(chain(*[self.Buffer(b) for b in self.BuffersSecond()])) + + list(chain(*[self.ScalarUse(s, flavour) for s in ["d1","d2","a","b","c","s"]]))) # As above, but for the clBLAS wrapper def ArgumentsWrapper(self, flavour): return (self.Options() + self.Sizes() + self.BufferWrapper("dot") + self.ScalarUseWrapper("alpha", flavour) + - self.BufferWrapper("a") + self.BufferWrapper("b") + self.BufferWrapper("x") + - self.ScalarUseWrapper("beta", flavour) + self.BufferWrapper("y") + self.BufferWrapper("c")) + list(chain(*[self.BufferWrapper(b) for b in self.BuffersFirst()])) + + self.ScalarUseWrapper("beta", flavour) + + list(chain(*[self.BufferWrapper(b) for b in self.BuffersSecond()])) + + list(chain(*[self.ScalarUseWrapper(s, flavour) for s in ["d1","d2","a","b","c","s"]]))) # Retrieves a combination of all the argument definitions def ArgumentsDef(self, flavour): return (self.OptionsDef() + self.SizesDef() + self.BufferDef("dot") + self.ScalarDef("alpha", flavour) + - self.BufferDef("a") + self.BufferDef("b") + self.BufferDef("x") + - self.ScalarDef("beta", flavour) + self.BufferDef("y") + self.BufferDef("c")) + list(chain(*[self.BufferDef(b) for b in self.BuffersFirst()])) + + self.ScalarDef("beta", flavour) + + list(chain(*[self.BufferDef(b) for b in self.BuffersSecond()])) + + list(chain(*[self.ScalarDef(s, flavour) for s in ["d1","d2","a","b","c","s"]]))) # As above, but clBLAS wrapper plain datatypes def ArgumentsDefWrapper(self, flavour): return (self.OptionsDefWrapper() + self.SizesDef() + self.BufferDef("dot") + self.ScalarDefPlain("alpha", flavour) + - self.BufferDef("a") + self.BufferDef("b") + self.BufferDef("x") + - self.ScalarDefPlain("beta", flavour) + self.BufferDef("y") + self.BufferDef("c")) + list(chain(*[self.BufferDef(b) for b in self.BuffersFirst()])) + + self.ScalarDefPlain("beta", flavour) + + list(chain(*[self.BufferDef(b) for b in self.BuffersSecond()])) + + list(chain(*[self.ScalarDefPlain(s, flavour) for s in ["d1","d2","a","b","c","s"]]))) # Retrieves a combination of all the argument types def ArgumentsType(self, flavour): return (self.OptionsType() + self.SizesType() + self.BufferType("dot") + self.ScalarType("alpha", flavour) + - self.BufferType("a") + self.BufferType("b") + self.BufferType("x") + - self.ScalarType("beta", flavour) + self.BufferType("y") + self.BufferType("c")) + list(chain(*[self.BufferType(b) for b in self.BuffersFirst()])) + + self.ScalarType("beta", flavour) + + list(chain(*[self.BufferType(b) for b in self.BuffersSecond()])) + + list(chain(*[self.ScalarType(s, flavour) for s in ["d1","d2","a","b","c","s"]]))) # ============================================================================================== @@ -290,7 +322,7 @@ class Routine(): result = "template <"+self.template.name+">\n" result += "StatusCode "+self.name.capitalize()+"(" result += (",\n"+indent).join([a for a in self.ArgumentsType(self.template)]) - result += ",\n"+indent+"cl_command_queue* queue, cl_event* event)" + result += ",\n"+indent+"cl_command_queue*, cl_event*)" return result # As above, but now for C diff --git a/src/clblast.cc b/src/clblast.cc index 0ced9ff7..a0dd8c70 100644 --- a/src/clblast.cc +++ b/src/clblast.cc @@ -269,7 +269,7 @@ template StatusCode Dotc<double2>(const size_t, // BLAS level-2 (matrix-vector) routines // ================================================================================================= -// Generalized matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV +// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV template <typename T> StatusCode Gemv(const Layout layout, const Transpose a_transpose, const size_t m, const size_t n, @@ -325,6 +325,51 @@ template StatusCode Gemv<double2>(const Layout, const Transpose, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV +template <typename T> +StatusCode Gbmv(const Layout, const Transpose, + const size_t, const size_t, const size_t, const size_t, + const T, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const T, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Gbmv<float>(const Layout, const Transpose, + const size_t, const size_t, const size_t, const size_t, + const float, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Gbmv<double>(const Layout, const Transpose, + const size_t, const size_t, const size_t, const size_t, + const double, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Gbmv<float2>(const Layout, const Transpose, + const size_t, const size_t, const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Gbmv<double2>(const Layout, const Transpose, + const size_t, const size_t, const size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + // Hermitian matrix-vector multiplication: CHEMV/ZHEMV template <typename T> StatusCode Hemv(const Layout layout, const Triangle triangle, @@ -365,6 +410,64 @@ template StatusCode Hemv<double2>(const Layout, const Triangle, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +// Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV +template <typename T> +StatusCode Hbmv(const Layout, const Triangle, + const size_t, const size_t, + const T, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const T, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Hbmv<float2>(const Layout, const Triangle, + const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Hbmv<double2>(const Layout, const Triangle, + const size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV +template <typename T> +StatusCode Hpmv(const Layout, const Triangle, + const size_t, + const T, + const cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + const T, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Hpmv<float2>(const Layout, const Triangle, + const size_t, + const float2, + const cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + const float2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Hpmv<double2>(const Layout, const Triangle, + const size_t, + const double2, + const cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + const double2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + // Symmetric matrix-vector multiplication: SSYMV/DSYMV template <typename T> StatusCode Symv(const Layout layout, const Triangle triangle, @@ -405,11 +508,523 @@ template StatusCode Symv<double>(const Layout, const Triangle, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV +template <typename T> +StatusCode Sbmv(const Layout, const Triangle, + const size_t, const size_t, + const T, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const T, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Sbmv<float>(const Layout, const Triangle, + const size_t, const size_t, + const float, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Sbmv<double>(const Layout, const Triangle, + const size_t, const size_t, + const double, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV +template <typename T> +StatusCode Spmv(const Layout, const Triangle, + const size_t, + const T, + const cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + const T, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Spmv<float>(const Layout, const Triangle, + const size_t, + const float, + const cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + const float, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Spmv<double>(const Layout, const Triangle, + const size_t, + const double, + const cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + const double, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV +template <typename T> +StatusCode Trmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Trmv<float>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Trmv<double>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Trmv<float2>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Trmv<double2>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV +template <typename T> +StatusCode Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Tbmv<float>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Tbmv<double>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Tbmv<float2>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Tbmv<double2>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV +template <typename T> +StatusCode Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Tpmv<float>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Tpmv<double>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Tpmv<float2>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Tpmv<double2>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV +template <typename T> +StatusCode Trsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Trsv<float>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Trsv<double>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Trsv<float2>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Trsv<double2>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV +template <typename T> +StatusCode Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Tbsv<float>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Tbsv<double>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Tbsv<float2>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Tbsv<double2>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV +template <typename T> +StatusCode Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Tpsv<float>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Tpsv<double>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Tpsv<float2>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Tpsv<double2>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// General rank-1 matrix update: SGER/DGER +template <typename T> +StatusCode Ger(const Layout, + const size_t, const size_t, + const T, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Ger<float>(const Layout, + const size_t, const size_t, + const float, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Ger<double>(const Layout, + const size_t, const size_t, + const double, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// General rank-1 complex matrix update: CGERU/ZGERU +template <typename T> +StatusCode Geru(const Layout, + const size_t, const size_t, + const T, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Geru<float2>(const Layout, + const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Geru<double2>(const Layout, + const size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// General rank-1 complex conjugated matrix update: CGERC/ZGERC +template <typename T> +StatusCode Gerc(const Layout, + const size_t, const size_t, + const T, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Gerc<float2>(const Layout, + const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Gerc<double2>(const Layout, + const size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Hermitian rank-1 matrix update: CHER/ZHER +template <typename T> +StatusCode Her(const Layout, const Triangle, + const size_t, + const T, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Her<float>(const Layout, const Triangle, + const size_t, + const float, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Her<double>(const Layout, const Triangle, + const size_t, + const double, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Hermitian packed rank-1 matrix update: CHPR/ZHPR +template <typename T> +StatusCode Hpr(const Layout, const Triangle, + const size_t, + const T, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Hpr<float>(const Layout, const Triangle, + const size_t, + const float, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Hpr<double>(const Layout, const Triangle, + const size_t, + const double, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); + +// Hermitian rank-2 matrix update: CHER2/ZHER2 +template <typename T> +StatusCode Her2(const Layout, const Triangle, + const size_t, + const T, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Her2<float2>(const Layout, const Triangle, + const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Her2<double2>(const Layout, const Triangle, + const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2 +template <typename T> +StatusCode Hpr2(const Layout, const Triangle, + const size_t, + const T, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Hpr2<float2>(const Layout, const Triangle, + const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Hpr2<double2>(const Layout, const Triangle, + const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); + +// Symmetric rank-1 matrix update: SSYR/DSYR +template <typename T> +StatusCode Syr(const Layout, const Triangle, + const size_t, + const T, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Syr<float>(const Layout, const Triangle, + const size_t, + const float, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Syr<double>(const Layout, const Triangle, + const size_t, + const double, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Symmetric packed rank-1 matrix update: SSPR/DSPR +template <typename T> +StatusCode Spr(const Layout, const Triangle, + const size_t, + const T, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Spr<float>(const Layout, const Triangle, + const size_t, + const float, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Spr<double>(const Layout, const Triangle, + const size_t, + const double, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); + +// Symmetric rank-2 matrix update: SSYR2/DSYR2 +template <typename T> +StatusCode Syr2(const Layout, const Triangle, + const size_t, + const T, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Syr2<float>(const Layout, const Triangle, + const size_t, + const float, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Syr2<double>(const Layout, const Triangle, + const size_t, + const double, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Symmetric packed rank-2 matrix update: SSPR2/DSPR2 +template <typename T> +StatusCode Spr2(const Layout, const Triangle, + const size_t, + const T, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Spr2<float>(const Layout, const Triangle, + const size_t, + const float, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Spr2<double>(const Layout, const Triangle, + const size_t, + const double, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); + // ================================================================================================= // BLAS level-3 (matrix-matrix) routines // ================================================================================================= -// Generalized matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM +// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM template <typename T> StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, const size_t m, const size_t n, const size_t k, @@ -787,5 +1402,40 @@ template StatusCode Trmm<double2>(const Layout, const Side, const Triangle, cons cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM +template <typename T> +StatusCode Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const T, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Trsm<float>(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const float, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Trsm<double>(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const double, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Trsm<float2>(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Trsm<double2>(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + // ================================================================================================= } // namespace clblast diff --git a/src/clblast_c.cc b/src/clblast_c.cc index eccf517f..fcec0951 100644 --- a/src/clblast_c.cc +++ b/src/clblast_c.cc @@ -363,6 +363,84 @@ StatusCode CLBlastZgemv(const Layout layout, const Transpose a_transpose, return static_cast<StatusCode>(status); } +// GBMV +StatusCode CLBlastSgbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Gbmv(static_cast<clblast::Layout>(layout), + static_cast<clblast::Transpose>(a_transpose), + m, n, kl, ku, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast<StatusCode>(status); +} +StatusCode CLBlastDgbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Gbmv(static_cast<clblast::Layout>(layout), + static_cast<clblast::Transpose>(a_transpose), + m, n, kl, ku, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast<StatusCode>(status); +} +StatusCode CLBlastCgbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Gbmv(static_cast<clblast::Layout>(layout), + static_cast<clblast::Transpose>(a_transpose), + m, n, kl, ku, + float2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + float2{beta.s[0], beta.s[1]}, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast<StatusCode>(status); +} +StatusCode CLBlastZgbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Gbmv(static_cast<clblast::Layout>(layout), + static_cast<clblast::Transpose>(a_transpose), + m, n, kl, ku, + double2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + double2{beta.s[0], beta.s[1]}, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast<StatusCode>(status); +} + // HEMV StatusCode CLBlastChemv(const Layout layout, const Triangle triangle, const size_t n, @@ -403,6 +481,86 @@ StatusCode CLBlastZhemv(const Layout layout, const Triangle triangle, return static_cast<StatusCode>(status); } +// HBMV +StatusCode CLBlastChbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Hbmv(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, k, + float2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + float2{beta.s[0], beta.s[1]}, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast<StatusCode>(status); +} +StatusCode CLBlastZhbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Hbmv(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, k, + double2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + double2{beta.s[0], beta.s[1]}, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast<StatusCode>(status); +} + +// HPMV +StatusCode CLBlastChpmv(const Layout layout, const Triangle triangle, + const size_t n, + const cl_float2 alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Hpmv(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + float2{alpha.s[0], alpha.s[1]}, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + float2{beta.s[0], beta.s[1]}, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast<StatusCode>(status); +} +StatusCode CLBlastZhpmv(const Layout layout, const Triangle triangle, + const size_t n, + const cl_double2 alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Hpmv(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + double2{alpha.s[0], alpha.s[1]}, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + double2{beta.s[0], beta.s[1]}, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast<StatusCode>(status); +} + // SYMV StatusCode CLBlastSsymv(const Layout layout, const Triangle triangle, const size_t n, @@ -443,6 +601,832 @@ StatusCode CLBlastDsymv(const Layout layout, const Triangle triangle, return static_cast<StatusCode>(status); } +// SBMV +StatusCode CLBlastSsbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Sbmv(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, k, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast<StatusCode>(status); +} +StatusCode CLBlastDsbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Sbmv(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, k, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast<StatusCode>(status); +} + +// SPMV +StatusCode CLBlastSspmv(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Spmv(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + alpha, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast<StatusCode>(status); +} +StatusCode CLBlastDspmv(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Spmv(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + alpha, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast<StatusCode>(status); +} + +// TRMV +StatusCode CLBlastStrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trmv<float>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast<StatusCode>(status); +} +StatusCode CLBlastDtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trmv<double>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast<StatusCode>(status); +} +StatusCode CLBlastCtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trmv<float2>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast<StatusCode>(status); +} +StatusCode CLBlastZtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trmv<double2>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast<StatusCode>(status); +} + +// TBMV +StatusCode CLBlastStbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tbmv<float>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast<StatusCode>(status); +} +StatusCode CLBlastDtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tbmv<double>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast<StatusCode>(status); +} +StatusCode CLBlastCtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tbmv<float2>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast<StatusCode>(status); +} +StatusCode CLBlastZtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tbmv<double2>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast<StatusCode>(status); +} + +// TPMV +StatusCode CLBlastStpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tpmv<float>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast<StatusCode>(status); +} +StatusCode CLBlastDtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tpmv<double>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast<StatusCode>(status); +} +StatusCode CLBlastCtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tpmv<float2>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast<StatusCode>(status); +} +StatusCode CLBlastZtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tpmv<double2>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast<StatusCode>(status); +} + +// TRSV +StatusCode CLBlastStrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trsv<float>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast<StatusCode>(status); +} +StatusCode CLBlastDtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trsv<double>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast<StatusCode>(status); +} +StatusCode CLBlastCtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trsv<float2>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast<StatusCode>(status); +} +StatusCode CLBlastZtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trsv<double2>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast<StatusCode>(status); +} + +// TBSV +StatusCode CLBlastStbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tbsv<float>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast<StatusCode>(status); +} +StatusCode CLBlastDtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tbsv<double>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast<StatusCode>(status); +} +StatusCode CLBlastCtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tbsv<float2>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast<StatusCode>(status); +} +StatusCode CLBlastZtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tbsv<double2>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast<StatusCode>(status); +} + +// TPSV +StatusCode CLBlastStpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tpsv<float>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast<StatusCode>(status); +} +StatusCode CLBlastDtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tpsv<double>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast<StatusCode>(status); +} +StatusCode CLBlastCtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tpsv<float2>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast<StatusCode>(status); +} +StatusCode CLBlastZtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tpsv<double2>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast<StatusCode>(status); +} + +// GER +StatusCode CLBlastSger(const Layout layout, + const size_t m, const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Ger(static_cast<clblast::Layout>(layout), + m, n, + alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast<StatusCode>(status); +} +StatusCode CLBlastDger(const Layout layout, + const size_t m, const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Ger(static_cast<clblast::Layout>(layout), + m, n, + alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast<StatusCode>(status); +} + +// GERU +StatusCode CLBlastCgeru(const Layout layout, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Geru(static_cast<clblast::Layout>(layout), + m, n, + float2{alpha.s[0], alpha.s[1]}, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast<StatusCode>(status); +} +StatusCode CLBlastZgeru(const Layout layout, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Geru(static_cast<clblast::Layout>(layout), + m, n, + double2{alpha.s[0], alpha.s[1]}, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast<StatusCode>(status); +} + +// GERC +StatusCode CLBlastCgerc(const Layout layout, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Gerc(static_cast<clblast::Layout>(layout), + m, n, + float2{alpha.s[0], alpha.s[1]}, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast<StatusCode>(status); +} +StatusCode CLBlastZgerc(const Layout layout, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Gerc(static_cast<clblast::Layout>(layout), + m, n, + double2{alpha.s[0], alpha.s[1]}, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast<StatusCode>(status); +} + +// HER +StatusCode CLBlastCher(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Her(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast<StatusCode>(status); +} +StatusCode CLBlastZher(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Her(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast<StatusCode>(status); +} + +// HPR +StatusCode CLBlastChpr(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Hpr(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + ap_buffer, ap_offset, + queue, event); + return static_cast<StatusCode>(status); +} +StatusCode CLBlastZhpr(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Hpr(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + ap_buffer, ap_offset, + queue, event); + return static_cast<StatusCode>(status); +} + +// HER2 +StatusCode CLBlastCher2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Her2(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + float2{alpha.s[0], alpha.s[1]}, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast<StatusCode>(status); +} +StatusCode CLBlastZher2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Her2(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + double2{alpha.s[0], alpha.s[1]}, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast<StatusCode>(status); +} + +// HPR2 +StatusCode CLBlastChpr2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Hpr2(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + float2{alpha.s[0], alpha.s[1]}, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + ap_buffer, ap_offset, + queue, event); + return static_cast<StatusCode>(status); +} +StatusCode CLBlastZhpr2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Hpr2(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + double2{alpha.s[0], alpha.s[1]}, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + ap_buffer, ap_offset, + queue, event); + return static_cast<StatusCode>(status); +} + +// SYR +StatusCode CLBlastSsyr(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Syr(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast<StatusCode>(status); +} +StatusCode CLBlastDsyr(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Syr(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast<StatusCode>(status); +} + +// SPR +StatusCode CLBlastSspr(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Spr(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + ap_buffer, ap_offset, + queue, event); + return static_cast<StatusCode>(status); +} +StatusCode CLBlastDspr(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Spr(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + ap_buffer, ap_offset, + queue, event); + return static_cast<StatusCode>(status); +} + +// SYR2 +StatusCode CLBlastSsyr2(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Syr2(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast<StatusCode>(status); +} +StatusCode CLBlastDsyr2(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Syr2(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast<StatusCode>(status); +} + +// SPR2 +StatusCode CLBlastSspr2(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Spr2(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + ap_buffer, ap_offset, + queue, event); + return static_cast<StatusCode>(status); +} +StatusCode CLBlastDspr2(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Spr2(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + ap_buffer, ap_offset, + queue, event); + return static_cast<StatusCode>(status); +} + // ================================================================================================= // BLAS level-3 (matrix-matrix) routines // ================================================================================================= @@ -963,4 +1947,78 @@ StatusCode CLBlastZtrmm(const Layout layout, const Side side, const Triangle tri return static_cast<StatusCode>(status); } +// TRSM +StatusCode CLBlastStrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trsm(static_cast<clblast::Layout>(layout), + static_cast<clblast::Side>(side), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event); + return static_cast<StatusCode>(status); +} +StatusCode CLBlastDtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trsm(static_cast<clblast::Layout>(layout), + static_cast<clblast::Side>(side), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event); + return static_cast<StatusCode>(status); +} +StatusCode CLBlastCtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trsm(static_cast<clblast::Layout>(layout), + static_cast<clblast::Side>(side), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + m, n, + float2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event); + return static_cast<StatusCode>(status); +} +StatusCode CLBlastZtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trsm(static_cast<clblast::Layout>(layout), + static_cast<clblast::Side>(side), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + m, n, + double2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event); + return static_cast<StatusCode>(status); +} + // ================================================================================================= diff --git a/test/wrapper_clblas.h b/test/wrapper_clblas.h index 85729470..10c7dd47 100644 --- a/test/wrapper_clblas.h +++ b/test/wrapper_clblas.h @@ -428,6 +428,80 @@ clblasStatus clblasXgemv(const clblasOrder layout, const clblasTranspose a_trans num_queues, queues, num_wait_events, wait_events, events); } +// Forwards the clBLAS calls for SGBMV/DGBMV/CGBMV/ZGBMV +clblasStatus clblasXgbmv(const clblasOrder layout, const clblasTranspose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSgbmv(layout, a_transpose, + m, n, kl, ku, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast<int>(x_inc), + beta, + y_buffer, y_offset, static_cast<int>(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXgbmv(const clblasOrder layout, const clblasTranspose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDgbmv(layout, a_transpose, + m, n, kl, ku, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast<int>(x_inc), + beta, + y_buffer, y_offset, static_cast<int>(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXgbmv(const clblasOrder layout, const clblasTranspose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasCgbmv(layout, a_transpose, + m, n, kl, ku, + cl_float2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast<int>(x_inc), + cl_float2{{beta.real(), beta.imag()}}, + y_buffer, y_offset, static_cast<int>(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXgbmv(const clblasOrder layout, const clblasTranspose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZgbmv(layout, a_transpose, + m, n, kl, ku, + cl_double2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast<int>(x_inc), + cl_double2{{beta.real(), beta.imag()}}, + y_buffer, y_offset, static_cast<int>(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} + // Forwards the clBLAS calls for CHEMV/ZHEMV clblasStatus clblasXhemv(const clblasOrder layout, const clblasUplo triangle, const size_t n, @@ -466,6 +540,82 @@ clblasStatus clblasXhemv(const clblasOrder layout, const clblasUplo triangle, num_queues, queues, num_wait_events, wait_events, events); } +// Forwards the clBLAS calls for CHBMV/ZHBMV +clblasStatus clblasXhbmv(const clblasOrder layout, const clblasUplo triangle, + const size_t n, const size_t k, + const float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasChbmv(layout, triangle, + n, k, + cl_float2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast<int>(x_inc), + cl_float2{{beta.real(), beta.imag()}}, + y_buffer, y_offset, static_cast<int>(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXhbmv(const clblasOrder layout, const clblasUplo triangle, + const size_t n, const size_t k, + const double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZhbmv(layout, triangle, + n, k, + cl_double2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast<int>(x_inc), + cl_double2{{beta.real(), beta.imag()}}, + y_buffer, y_offset, static_cast<int>(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for CHPMV/ZHPMV +clblasStatus clblasXhpmv(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const float2 alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasChpmv(layout, triangle, + n, + cl_float2{{alpha.real(), alpha.imag()}}, + ap_buffer, ap_offset, + x_buffer, x_offset, static_cast<int>(x_inc), + cl_float2{{beta.real(), beta.imag()}}, + y_buffer, y_offset, static_cast<int>(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXhpmv(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const double2 alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZhpmv(layout, triangle, + n, + cl_double2{{alpha.real(), alpha.imag()}}, + ap_buffer, ap_offset, + x_buffer, x_offset, static_cast<int>(x_inc), + cl_double2{{beta.real(), beta.imag()}}, + y_buffer, y_offset, static_cast<int>(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} + // Forwards the clBLAS calls for SSYMV/DSYMV clblasStatus clblasXsymv(const clblasOrder layout, const clblasUplo triangle, const size_t n, @@ -504,6 +654,854 @@ clblasStatus clblasXsymv(const clblasOrder layout, const clblasUplo triangle, num_queues, queues, num_wait_events, wait_events, events); } +// Forwards the clBLAS calls for SSBMV/DSBMV +clblasStatus clblasXsbmv(const clblasOrder layout, const clblasUplo triangle, + const size_t n, const size_t k, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSsbmv(layout, triangle, + n, k, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast<int>(x_inc), + beta, + y_buffer, y_offset, static_cast<int>(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXsbmv(const clblasOrder layout, const clblasUplo triangle, + const size_t n, const size_t k, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDsbmv(layout, triangle, + n, k, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast<int>(x_inc), + beta, + y_buffer, y_offset, static_cast<int>(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for SSPMV/DSPMV +clblasStatus clblasXspmv(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const float alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSspmv(layout, triangle, + n, + alpha, + ap_buffer, ap_offset, + x_buffer, x_offset, static_cast<int>(x_inc), + beta, + y_buffer, y_offset, static_cast<int>(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXspmv(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const double alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDspmv(layout, triangle, + n, + alpha, + ap_buffer, ap_offset, + x_buffer, x_offset, static_cast<int>(x_inc), + beta, + y_buffer, y_offset, static_cast<int>(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for STRMV/DTRMV/CTRMV/ZTRMV +template <typename T> +clblasStatus clblasXtrmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); +template <> +clblasStatus clblasXtrmv<float>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer<float>(context, n); + return clblasStrmv(layout, triangle, a_transpose, diagonal, + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast<int>(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtrmv<double>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer<double>(context, n); + return clblasDtrmv(layout, triangle, a_transpose, diagonal, + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast<int>(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtrmv<float2>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer<float2>(context, n); + return clblasCtrmv(layout, triangle, a_transpose, diagonal, + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast<int>(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtrmv<double2>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer<double2>(context, n); + return clblasZtrmv(layout, triangle, a_transpose, diagonal, + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast<int>(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for STBMV/DTBMV/CTBMV/ZTBMV +template <typename T> +clblasStatus clblasXtbmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); +template <> +clblasStatus clblasXtbmv<float>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer<float>(context, n); + return clblasStbmv(layout, triangle, a_transpose, diagonal, + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast<int>(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtbmv<double>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer<double>(context, n); + return clblasDtbmv(layout, triangle, a_transpose, diagonal, + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast<int>(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtbmv<float2>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer<float2>(context, n); + return clblasCtbmv(layout, triangle, a_transpose, diagonal, + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast<int>(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtbmv<double2>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer<double2>(context, n); + return clblasZtbmv(layout, triangle, a_transpose, diagonal, + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast<int>(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for STPMV/DTPMV/CTPMV/ZTPMV +template <typename T> +clblasStatus clblasXtpmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); +template <> +clblasStatus clblasXtpmv<float>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer<float>(context, n); + return clblasStpmv(layout, triangle, a_transpose, diagonal, + n, + ap_buffer, ap_offset, + x_buffer, x_offset, static_cast<int>(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtpmv<double>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer<double>(context, n); + return clblasDtpmv(layout, triangle, a_transpose, diagonal, + n, + ap_buffer, ap_offset, + x_buffer, x_offset, static_cast<int>(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtpmv<float2>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer<float2>(context, n); + return clblasCtpmv(layout, triangle, a_transpose, diagonal, + n, + ap_buffer, ap_offset, + x_buffer, x_offset, static_cast<int>(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtpmv<double2>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer<double2>(context, n); + return clblasZtpmv(layout, triangle, a_transpose, diagonal, + n, + ap_buffer, ap_offset, + x_buffer, x_offset, static_cast<int>(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for STRSV/DTRSV/CTRSV/ZTRSV +template <typename T> +clblasStatus clblasXtrsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); +template <> +clblasStatus clblasXtrsv<float>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasStrsv(layout, triangle, a_transpose, diagonal, + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast<int>(x_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtrsv<double>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDtrsv(layout, triangle, a_transpose, diagonal, + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast<int>(x_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtrsv<float2>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasCtrsv(layout, triangle, a_transpose, diagonal, + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast<int>(x_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtrsv<double2>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZtrsv(layout, triangle, a_transpose, diagonal, + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast<int>(x_inc), + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for STBSV/DTBSV/CTBSV/ZTBSV +template <typename T> +clblasStatus clblasXtbsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); +template <> +clblasStatus clblasXtbsv<float>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasStbsv(layout, triangle, a_transpose, diagonal, + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast<int>(x_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtbsv<double>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDtbsv(layout, triangle, a_transpose, diagonal, + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast<int>(x_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtbsv<float2>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasCtbsv(layout, triangle, a_transpose, diagonal, + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast<int>(x_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtbsv<double2>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZtbsv(layout, triangle, a_transpose, diagonal, + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast<int>(x_inc), + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for STPSV/DTPSV/CTPSV/ZTPSV +template <typename T> +clblasStatus clblasXtpsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); +template <> +clblasStatus clblasXtpsv<float>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasStpsv(layout, triangle, a_transpose, diagonal, + n, + ap_buffer, ap_offset, + x_buffer, x_offset, static_cast<int>(x_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtpsv<double>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDtpsv(layout, triangle, a_transpose, diagonal, + n, + ap_buffer, ap_offset, + x_buffer, x_offset, static_cast<int>(x_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtpsv<float2>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasCtpsv(layout, triangle, a_transpose, diagonal, + n, + ap_buffer, ap_offset, + x_buffer, x_offset, static_cast<int>(x_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtpsv<double2>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZtpsv(layout, triangle, a_transpose, diagonal, + n, + ap_buffer, ap_offset, + x_buffer, x_offset, static_cast<int>(x_inc), + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for SGER/DGER +clblasStatus clblasXger(const clblasOrder layout, + const size_t m, const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSger(layout, + m, n, + alpha, + x_buffer, x_offset, static_cast<int>(x_inc), + y_buffer, y_offset, static_cast<int>(y_inc), + a_buffer, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXger(const clblasOrder layout, + const size_t m, const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDger(layout, + m, n, + alpha, + x_buffer, x_offset, static_cast<int>(x_inc), + y_buffer, y_offset, static_cast<int>(y_inc), + a_buffer, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for CGERU/ZGERU +clblasStatus clblasXgeru(const clblasOrder layout, + const size_t m, const size_t n, + const float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasCgeru(layout, + m, n, + cl_float2{{alpha.real(), alpha.imag()}}, + x_buffer, x_offset, static_cast<int>(x_inc), + y_buffer, y_offset, static_cast<int>(y_inc), + a_buffer, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXgeru(const clblasOrder layout, + const size_t m, const size_t n, + const double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZgeru(layout, + m, n, + cl_double2{{alpha.real(), alpha.imag()}}, + x_buffer, x_offset, static_cast<int>(x_inc), + y_buffer, y_offset, static_cast<int>(y_inc), + a_buffer, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for CGERC/ZGERC +clblasStatus clblasXgerc(const clblasOrder layout, + const size_t m, const size_t n, + const float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasCgerc(layout, + m, n, + cl_float2{{alpha.real(), alpha.imag()}}, + x_buffer, x_offset, static_cast<int>(x_inc), + y_buffer, y_offset, static_cast<int>(y_inc), + a_buffer, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXgerc(const clblasOrder layout, + const size_t m, const size_t n, + const double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZgerc(layout, + m, n, + cl_double2{{alpha.real(), alpha.imag()}}, + x_buffer, x_offset, static_cast<int>(x_inc), + y_buffer, y_offset, static_cast<int>(y_inc), + a_buffer, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for CHER/ZHER +clblasStatus clblasXher(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasCher(layout, triangle, + n, + alpha, + x_buffer, x_offset, static_cast<int>(x_inc), + a_buffer, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXher(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZher(layout, triangle, + n, + alpha, + x_buffer, x_offset, static_cast<int>(x_inc), + a_buffer, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for CHPR/ZHPR +clblasStatus clblasXhpr(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasChpr(layout, triangle, + n, + alpha, + x_buffer, x_offset, static_cast<int>(x_inc), + ap_buffer, ap_offset, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXhpr(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZhpr(layout, triangle, + n, + alpha, + x_buffer, x_offset, static_cast<int>(x_inc), + ap_buffer, ap_offset, + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for CHER2/ZHER2 +clblasStatus clblasXher2(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasCher2(layout, triangle, + n, + cl_float2{{alpha.real(), alpha.imag()}}, + x_buffer, x_offset, static_cast<int>(x_inc), + y_buffer, y_offset, static_cast<int>(y_inc), + a_buffer, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXher2(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZher2(layout, triangle, + n, + cl_double2{{alpha.real(), alpha.imag()}}, + x_buffer, x_offset, static_cast<int>(x_inc), + y_buffer, y_offset, static_cast<int>(y_inc), + a_buffer, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for CHPR2/ZHPR2 +clblasStatus clblasXhpr2(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasChpr2(layout, triangle, + n, + cl_float2{{alpha.real(), alpha.imag()}}, + x_buffer, x_offset, static_cast<int>(x_inc), + y_buffer, y_offset, static_cast<int>(y_inc), + ap_buffer, ap_offset, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXhpr2(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZhpr2(layout, triangle, + n, + cl_double2{{alpha.real(), alpha.imag()}}, + x_buffer, x_offset, static_cast<int>(x_inc), + y_buffer, y_offset, static_cast<int>(y_inc), + ap_buffer, ap_offset, + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for SSYR/DSYR +clblasStatus clblasXsyr(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSsyr(layout, triangle, + n, + alpha, + x_buffer, x_offset, static_cast<int>(x_inc), + a_buffer, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXsyr(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDsyr(layout, triangle, + n, + alpha, + x_buffer, x_offset, static_cast<int>(x_inc), + a_buffer, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for SSPR/DSPR +clblasStatus clblasXspr(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSspr(layout, triangle, + n, + alpha, + x_buffer, x_offset, static_cast<int>(x_inc), + ap_buffer, ap_offset, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXspr(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDspr(layout, triangle, + n, + alpha, + x_buffer, x_offset, static_cast<int>(x_inc), + ap_buffer, ap_offset, + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for SSYR2/DSYR2 +clblasStatus clblasXsyr2(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSsyr2(layout, triangle, + n, + alpha, + x_buffer, x_offset, static_cast<int>(x_inc), + y_buffer, y_offset, static_cast<int>(y_inc), + a_buffer, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXsyr2(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDsyr2(layout, triangle, + n, + alpha, + x_buffer, x_offset, static_cast<int>(x_inc), + y_buffer, y_offset, static_cast<int>(y_inc), + a_buffer, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for SSPR2/DSPR2 +clblasStatus clblasXspr2(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSspr2(layout, triangle, + n, + alpha, + x_buffer, x_offset, static_cast<int>(x_inc), + y_buffer, y_offset, static_cast<int>(y_inc), + ap_buffer, ap_offset, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXspr2(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDspr2(layout, triangle, + n, + alpha, + x_buffer, x_offset, static_cast<int>(x_inc), + y_buffer, y_offset, static_cast<int>(y_inc), + ap_buffer, ap_offset, + num_queues, queues, num_wait_events, wait_events, events); +} + // ================================================================================================= // BLAS level-3 (matrix-matrix) routines // ================================================================================================= @@ -964,6 +1962,64 @@ clblasStatus clblasXtrmm(const clblasOrder layout, const clblasSide side, const num_queues, queues, num_wait_events, wait_events, events); } +// Forwards the clBLAS calls for STRSM/DTRSM/CTRSM/ZTRSM +clblasStatus clblasXtrsm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t m, const size_t n, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasStrsm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXtrsm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t m, const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDtrsm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXtrsm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t m, const size_t n, + const float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasCtrsm(layout, side, triangle, a_transpose, diagonal, + m, n, + cl_float2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXtrsm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t m, const size_t n, + const double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZtrsm(layout, side, triangle, a_transpose, diagonal, + m, n, + cl_double2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + num_queues, queues, num_wait_events, wait_events, events); +} + // ================================================================================================= } // namespace clblast |