From f96fd372bc3087938572ebc55bd1d8e1b7e6f18a Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Tue, 25 Oct 2016 14:28:52 +0200 Subject: Added initial version of a Netlib CBLAS implementation. TODO: Set correct buffer sizes --- CMakeLists.txt | 2 + include/clblast_blas.h | 158 +- include/clblast_c.h | 5 - scripts/generator/generator.py | 108 +- scripts/generator/generator/cpp.py | 64 +- scripts/generator/generator/datatype.py | 16 + scripts/generator/generator/routine.py | 59 +- src/clblast_blas.cpp | 4651 +++++++++++++++++++++++++++++++ 8 files changed, 4817 insertions(+), 246 deletions(-) create mode 100644 src/clblast_blas.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index f5edbd75..d2034617 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -163,6 +163,7 @@ set(PRECISIONS 32 64 3232 6464 16) # Gathers all source-files set(SOURCES + src/clblast_blas.cpp src/database/database.cpp src/routines/common.cpp src/utilities/clblast_exceptions.cpp @@ -213,6 +214,7 @@ install(TARGETS clblast EXPORT CLBlast DESTINATION lib) install(FILES include/clblast.h DESTINATION include) install(FILES include/clblast_c.h DESTINATION include) install(FILES include/clblast_half.h DESTINATION include) +install(FILES include/clblast_blas.h DESTINATION include) # Installs the config for find_package in dependent projects install(EXPORT CLBlast DESTINATION lib/cmake/CLBLast FILE CLBlastConfig.cmake) diff --git a/include/clblast_blas.h b/include/clblast_blas.h index a5d0cc9c..b4db4192 100644 --- a/include/clblast_blas.h +++ b/include/clblast_blas.h @@ -18,8 +18,8 @@ // Exports library functions under Windows when building a DLL. 
See also: // https://msdn.microsoft.com/en-us/library/a90k134d.aspx -#ifdef _WIN32 - #ifdef COMPILING_DLL +#if defined(_WIN32) && defined(CLBLAST_DLL) + #if defined(COMPILING_DLL) #define PUBLIC_API __declspec(dllexport) #else #define PUBLIC_API __declspec(dllimport) @@ -42,6 +42,7 @@ typedef enum Triangle_ { kUpper = 121, kLower = 122 } Triangle; typedef enum Diagonal_ { kNonUnit = 131, kUnit = 132 } Diagonal; typedef enum Side_ { kLeft = 141, kRight = 142 } Side; + // ================================================================================================= // BLAS level-1 (vector-vector) routines // ================================================================================================= @@ -103,9 +104,6 @@ void PUBLIC_API cblas_cswap(const int n, void PUBLIC_API cblas_zswap(const int n, void* x, const int x_inc, void* y, const int y_inc); -void PUBLIC_API cblas_hswap(const int n, - void* x, const int x_inc, - void* y, const int y_inc); // Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL void PUBLIC_API cblas_sscal(const int n, @@ -120,9 +118,6 @@ void PUBLIC_API cblas_cscal(const int n, void PUBLIC_API cblas_zscal(const int n, const void* alpha, void* x, const int x_inc); -void PUBLIC_API cblas_hscal(const int n, - const void* alpha, - void* x, const int x_inc); // Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY void PUBLIC_API cblas_scopy(const int n, @@ -137,9 +132,6 @@ void PUBLIC_API cblas_ccopy(const int n, void PUBLIC_API cblas_zcopy(const int n, const void* x, const int x_inc, void* y, const int y_inc); -void PUBLIC_API cblas_hcopy(const int n, - const void* x, const int x_inc, - void* y, const int y_inc); // Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY void PUBLIC_API cblas_saxpy(const int n, @@ -158,10 +150,6 @@ void PUBLIC_API cblas_zaxpy(const int n, const void* alpha, const void* x, const int x_inc, void* y, const int y_inc); -void PUBLIC_API cblas_haxpy(const int n, - const void* alpha, - const void* x, const int x_inc, - 
void* y, const int y_inc); // Dot product of two vectors: SDOT/DDOT/HDOT void PUBLIC_API cblas_sdot(const int n, @@ -172,10 +160,6 @@ void PUBLIC_API cblas_ddot(const int n, double* dot, const double* x, const int x_inc, const double* y, const int y_inc); -void PUBLIC_API cblas_hdot(const int n, - void* dot, - const void* x, const int x_inc, - const void* y, const int y_inc); // Dot product of two complex vectors: CDOTU/ZDOTU void PUBLIC_API cblas_cdotu(const int n, @@ -210,9 +194,6 @@ void PUBLIC_API cblas_scnrm2(const int n, void PUBLIC_API cblas_dznrm2(const int n, void* nrm2, const void* x, const int x_inc); -void PUBLIC_API cblas_hnrm2(const int n, - void* nrm2, - const void* x, const int x_inc); // Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM void PUBLIC_API cblas_sasum(const int n, @@ -227,9 +208,6 @@ void PUBLIC_API cblas_scasum(const int n, void PUBLIC_API cblas_dzasum(const int n, void* asum, const void* x, const int x_inc); -void PUBLIC_API cblas_hasum(const int n, - void* asum, - const void* x, const int x_inc); // Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM void PUBLIC_API cblas_ssum(const int n, @@ -244,9 +222,6 @@ void PUBLIC_API cblas_scsum(const int n, void PUBLIC_API cblas_dzsum(const int n, void* sum, const void* x, const int x_inc); -void PUBLIC_API cblas_hsum(const int n, - void* sum, - const void* x, const int x_inc); // Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX void PUBLIC_API cblas_isamax(const int n, @@ -261,9 +236,6 @@ void PUBLIC_API cblas_icamax(const int n, void PUBLIC_API cblas_izamax(const int n, void* imax, const void* x, const int x_inc); -void PUBLIC_API cblas_ihamax(const int n, - void* imax, - const void* x, const int x_inc); // Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX void PUBLIC_API cblas_ismax(const int n, @@ -278,9 +250,6 @@ void PUBLIC_API cblas_icmax(const int n, void PUBLIC_API 
cblas_izmax(const int n, void* imax, const void* x, const int x_inc); -void PUBLIC_API cblas_ihmax(const int n, - void* imax, - const void* x, const int x_inc); // Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN void PUBLIC_API cblas_ismin(const int n, @@ -295,9 +264,6 @@ void PUBLIC_API cblas_icmin(const int n, void PUBLIC_API cblas_izmin(const int n, void* imin, const void* x, const int x_inc); -void PUBLIC_API cblas_ihmin(const int n, - void* imin, - const void* x, const int x_inc); // ================================================================================================= // BLAS level-2 (matrix-vector) routines @@ -332,13 +298,6 @@ void PUBLIC_API cblas_zgemv(const Layout layout, const Transpose a_transpose, const void* x, const int x_inc, const void* beta, void* y, const int y_inc); -void PUBLIC_API cblas_hgemv(const Layout layout, const Transpose a_transpose, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - const void* x, const int x_inc, - const void* beta, - void* y, const int y_inc); // General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV void PUBLIC_API cblas_sgbmv(const Layout layout, const Transpose a_transpose, @@ -369,13 +328,6 @@ void PUBLIC_API cblas_zgbmv(const Layout layout, const Transpose a_transpose, const void* x, const int x_inc, const void* beta, void* y, const int y_inc); -void PUBLIC_API cblas_hgbmv(const Layout layout, const Transpose a_transpose, - const int m, const int n, const int kl, const int ku, - const void* alpha, - const void* a, const int a_ld, - const void* x, const int x_inc, - const void* beta, - void* y, const int y_inc); // Hermitian matrix-vector multiplication: CHEMV/ZHEMV void PUBLIC_API cblas_chemv(const Layout layout, const Triangle triangle, @@ -440,13 +392,6 @@ void PUBLIC_API cblas_dsymv(const Layout layout, const Triangle triangle, const double* x, const int x_inc, const double beta, double* y, const int 
y_inc); -void PUBLIC_API cblas_hsymv(const Layout layout, const Triangle triangle, - const int n, - const void* alpha, - const void* a, const int a_ld, - const void* x, const int x_inc, - const void* beta, - void* y, const int y_inc); // Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV void PUBLIC_API cblas_ssbmv(const Layout layout, const Triangle triangle, @@ -463,13 +408,6 @@ void PUBLIC_API cblas_dsbmv(const Layout layout, const Triangle triangle, const double* x, const int x_inc, const double beta, double* y, const int y_inc); -void PUBLIC_API cblas_hsbmv(const Layout layout, const Triangle triangle, - const int n, const int k, - const void* alpha, - const void* a, const int a_ld, - const void* x, const int x_inc, - const void* beta, - void* y, const int y_inc); // Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV void PUBLIC_API cblas_sspmv(const Layout layout, const Triangle triangle, @@ -486,13 +424,6 @@ void PUBLIC_API cblas_dspmv(const Layout layout, const Triangle triangle, const double* x, const int x_inc, const double beta, double* y, const int y_inc); -void PUBLIC_API cblas_hspmv(const Layout layout, const Triangle triangle, - const int n, - const void* alpha, - const void* ap, - const void* x, const int x_inc, - const void* beta, - void* y, const int y_inc); // Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV void PUBLIC_API cblas_strmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, @@ -511,10 +442,6 @@ void PUBLIC_API cblas_ztrmv(const Layout layout, const Triangle triangle, const const int n, const void* a, const int a_ld, void* x, const int x_inc); -void PUBLIC_API cblas_htrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const int n, - const void* a, const int a_ld, - void* x, const int x_inc); // Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV void 
PUBLIC_API cblas_stbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, @@ -533,10 +460,6 @@ void PUBLIC_API cblas_ztbmv(const Layout layout, const Triangle triangle, const const int n, const int k, const void* a, const int a_ld, void* x, const int x_inc); -void PUBLIC_API cblas_htbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const int n, const int k, - const void* a, const int a_ld, - void* x, const int x_inc); // Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV void PUBLIC_API cblas_stpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, @@ -555,10 +478,6 @@ void PUBLIC_API cblas_ztpmv(const Layout layout, const Triangle triangle, const const int n, const void* ap, void* x, const int x_inc); -void PUBLIC_API cblas_htpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const int n, - const void* ap, - void* x, const int x_inc); // Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV void PUBLIC_API cblas_strsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, @@ -627,12 +546,6 @@ void PUBLIC_API cblas_dger(const Layout layout, const double* x, const int x_inc, const double* y, const int y_inc, double* a, const int a_ld); -void PUBLIC_API cblas_hger(const Layout layout, - const int m, const int n, - const void* alpha, - const void* x, const int x_inc, - const void* y, const int y_inc, - void* a, const int a_ld); // General rank-1 complex matrix update: CGERU/ZGERU void PUBLIC_API cblas_cgeru(const Layout layout, @@ -725,11 +638,6 @@ void PUBLIC_API cblas_dsyr(const Layout layout, const Triangle triangle, const double alpha, const double* x, const int x_inc, double* a, const int a_ld); -void PUBLIC_API cblas_hsyr(const Layout layout, const Triangle triangle, - const 
int n, - const void* alpha, - const void* x, const int x_inc, - void* a, const int a_ld); // Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR void PUBLIC_API cblas_sspr(const Layout layout, const Triangle triangle, @@ -742,11 +650,6 @@ void PUBLIC_API cblas_dspr(const Layout layout, const Triangle triangle, const double alpha, const double* x, const int x_inc, double* ap); -void PUBLIC_API cblas_hspr(const Layout layout, const Triangle triangle, - const int n, - const void* alpha, - const void* x, const int x_inc, - void* ap); // Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2 void PUBLIC_API cblas_ssyr2(const Layout layout, const Triangle triangle, @@ -761,12 +664,6 @@ void PUBLIC_API cblas_dsyr2(const Layout layout, const Triangle triangle, const double* x, const int x_inc, const double* y, const int y_inc, double* a, const int a_ld); -void PUBLIC_API cblas_hsyr2(const Layout layout, const Triangle triangle, - const int n, - const void* alpha, - const void* x, const int x_inc, - const void* y, const int y_inc, - void* a, const int a_ld); // Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2 void PUBLIC_API cblas_sspr2(const Layout layout, const Triangle triangle, @@ -781,12 +678,6 @@ void PUBLIC_API cblas_dspr2(const Layout layout, const Triangle triangle, const double* x, const int x_inc, const double* y, const int y_inc, double* ap); -void PUBLIC_API cblas_hspr2(const Layout layout, const Triangle triangle, - const int n, - const void* alpha, - const void* x, const int x_inc, - const void* y, const int y_inc, - void* ap); // ================================================================================================= // BLAS level-3 (matrix-matrix) routines @@ -821,13 +712,6 @@ void PUBLIC_API cblas_zgemm(const Layout layout, const Transpose a_transpose, co const void* b, const int b_ld, const void* beta, void* c, const int c_ld); -void PUBLIC_API cblas_hgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, - const 
int m, const int n, const int k, - const void* alpha, - const void* a, const int a_ld, - const void* b, const int b_ld, - const void* beta, - void* c, const int c_ld); // Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM void PUBLIC_API cblas_ssymm(const Layout layout, const Side side, const Triangle triangle, @@ -858,13 +742,6 @@ void PUBLIC_API cblas_zsymm(const Layout layout, const Side side, const Triangle const void* b, const int b_ld, const void* beta, void* c, const int c_ld); -void PUBLIC_API cblas_hsymm(const Layout layout, const Side side, const Triangle triangle, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - const void* b, const int b_ld, - const void* beta, - void* c, const int c_ld); // Hermitian matrix-matrix multiplication: CHEMM/ZHEMM void PUBLIC_API cblas_chemm(const Layout layout, const Side side, const Triangle triangle, @@ -907,12 +784,6 @@ void PUBLIC_API cblas_zsyrk(const Layout layout, const Triangle triangle, const const void* a, const int a_ld, const void* beta, void* c, const int c_ld); -void PUBLIC_API cblas_hsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, - const int n, const int k, - const void* alpha, - const void* a, const int a_ld, - const void* beta, - void* c, const int c_ld); // Rank-K update of a hermitian matrix: CHERK/ZHERK void PUBLIC_API cblas_cherk(const Layout layout, const Triangle triangle, const Transpose a_transpose, @@ -957,13 +828,6 @@ void PUBLIC_API cblas_zsyr2k(const Layout layout, const Triangle triangle, const const void* b, const int b_ld, const void* beta, void* c, const int c_ld); -void PUBLIC_API cblas_hsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const int n, const int k, - const void* alpha, - const void* a, const int a_ld, - const void* b, const int b_ld, - const void* beta, - void* c, const int c_ld); // Rank-2K update of a hermitian matrix: CHER2K/ZHER2K void PUBLIC_API 
cblas_cher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, @@ -1002,11 +866,6 @@ void PUBLIC_API cblas_ztrmm(const Layout layout, const Side side, const Triangle const void* alpha, const void* a, const int a_ld, void* b, const int b_ld); -void PUBLIC_API cblas_htrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - void* b, const int b_ld); // Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM void PUBLIC_API cblas_strsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, @@ -1029,11 +888,6 @@ void PUBLIC_API cblas_ztrsm(const Layout layout, const Side side, const Triangle const void* alpha, const void* a, const int a_ld, void* b, const int b_ld); -void PUBLIC_API cblas_htrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - void* b, const int b_ld); // ================================================================================================= // Extra non-BLAS routines (level-X) @@ -1060,12 +914,6 @@ void PUBLIC_API cblas_zomatcopy(const Layout layout, const Transpose a_transpose const void* alpha, const void* a, const int a_ld, void* b, const int b_ld); -void PUBLIC_API cblas_homatcopy(const Layout layout, const Transpose a_transpose, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - void* b, const int b_ld); - half* b, const size_t b_offset, const size_t b_ld); // ================================================================================================= diff --git a/include/clblast_c.h b/include/clblast_c.h index 81f093cd..72f50d83 100644 --- a/include/clblast_c.h +++ b/include/clblast_c.h @@ -117,11 +117,6 
@@ typedef enum CLBlastDiagonal_ { CLBlastDiagonalNonUnit = 131, CLBlastDiagonalUnit = 132 } CLBlastDiagonal; typedef enum CLBlastSide_ { CLBlastSideLeft = 141, CLBlastSideRight = 142 } CLBlastSide; -// Precision scoped enum (values in bits) -typedef enum CLBlastPrecision_ { CLBlastPrecisionHalf = 16, CLBlastPrecisionSingle = 32, - CLBlastPrecisionDouble = 64, CLBlastPrecisionComplexSingle = 3232, - CLBlastPrecisionComplexDouble = 6464 } CLBlastPrecision; - // ================================================================================================= // BLAS level-1 (vector-vector) routines // ================================================================================================= diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 220b314d..4ba97ff8 100755 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -41,8 +41,8 @@ FILES = [ "/include/clblast_blas.h", "/src/clblast_blas.cpp", ] -HEADER_LINES = [117, 73, 118, 22, 29, 41, 43, 1] -FOOTER_LINES = [17, 80, 19, 18, 6, 6, 10, 1] +HEADER_LINES = [117, 73, 118, 22, 29, 41, 44, 32] +FOOTER_LINES = [17, 80, 19, 18, 6, 6, 9, 3] # Different possibilities for requirements ald_m = "The value of `a_ld` must be at least `m`." @@ -64,65 +64,65 @@ cld_n = "The value of `c_ld` must be at least `n`." 
# Populates a list of routines ROUTINES = [ [ # Level 1: vector-vector - Routine(False, True, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], [], "", "Generate givens plane rotation", "", []), - Routine(False, True, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], [], "", "Generate modified givens plane rotation", "", []), - Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["cos","sin"], "", "Apply givens plane rotation", "", []), - Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [], "", "Apply modified givens plane rotation", "", []), - Routine(True, True, "1", "swap", T, [S,D,C,Z,H], ["n"], [], [], ["x","y"], [], "", "Swap two vectors", "Interchanges _n_ elements of vectors _x_ and _y_.", []), - Routine(True, True, "1", "scal", T, [S,D,C,Z,H], ["n"], [], [], ["x"], ["alpha"], "", "Vector scaling", "Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.", []), - Routine(True, True, "1", "copy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [], "", "Vector copy", "Copies the contents of vector _x_ into vector _y_.", []), - Routine(True, True, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant.", []), - Routine(True, True, "1", "dot", T, [S,D,H], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two vectors", "Multiplies _n_ elements of the vectors _x_ and _y_ element-wise and accumulates the results. 
The sum is stored in the _dot_ buffer.", []), - Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []), - Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []), - Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []), - Routine(True, True, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []), - Routine(True, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []), - Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []), - Routine(True, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. 
This routine is the non-absolute version of the IxAMAX BLAS routine.", []), - Routine(True, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []), + Routine(False, True, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], ["1","1","1","1"], [], "", "Generate givens plane rotation", "", []), + Routine(False, True, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], ["1","1","1","1","1"], [], "", "Generate modified givens plane rotation", "", []), + Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["n","n"], ["cos","sin"], "", "Apply givens plane rotation", "", []), + Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], ["n","n","1"], [], "", "Apply modified givens plane rotation", "", []), + Routine(True, True, "1", "swap", T, [S,D,C,Z,H], ["n"], [], [], ["x","y"], ["n","n"], [], "", "Swap two vectors", "Interchanges _n_ elements of vectors _x_ and _y_.", []), + Routine(True, True, "1", "scal", T, [S,D,C,Z,H], ["n"], [], [], ["x"], ["n"], ["alpha"], "", "Vector scaling", "Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.", []), + Routine(True, True, "1", "copy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["n","n"], [], "", "Vector copy", "Copies the contents of vector _x_ into vector _y_.", []), + Routine(True, True, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["n","n"], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant.", []), + Routine(True, True, "1", "dot", T, [S,D,H], ["n"], [], ["x","y"], ["dot"], ["n","n","1"], [], "n", "Dot product of two vectors", "Multiplies _n_ 
elements of the vectors _x_ and _y_ element-wise and accumulates the results. The sum is stored in the _dot_ buffer.", []), + Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], ["n","n","1"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []), + Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], ["n","n","1"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []), + Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], ["n","1"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []), + Routine(True, True, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], ["n","1"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []), + Routine(True, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], ["n","1"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []), + Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], ["n","1"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []), + Routine(True, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], ["n","1"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. 
This routine is the non-absolute version of the IxAMAX BLAS routine.", []), + Routine(True, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], ["n","1"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []), ], [ # Level 2: matrix-vector - Routine(True, True, "2a", "gemv", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]), - Routine(True, True, "2a", "gbmv", T, [S,D,C,Z,H], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is banded instead.", [ald_kl_ku_one]), - Routine(True, True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian matrix instead.", [ald_n]), - Routine(True, True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian banded matrix instead.", [ald_k_one]), - Routine(True, True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2a", 
"symv", T, [S,D,H], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric instead.", [ald_n]), - Routine(True, True, "2a", "sbmv", T, [S,D,H], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric and banded instead.", [ald_k_one]), - Routine(True, True, "2a", "spmv", T, [S,D,H], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2a", "trmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular instead.", [ald_n]), - Routine(True, True, "2a", "tbmv", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular and banded instead.", [ald_k_one]), - Routine(True, True, "2a", "tpmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and repreented as _AP_.", []), - Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a triangular system of equations", "", []), - Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a banded triangular system of equations", "", [ald_k_one]), - Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], 
["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "", "Solves a packed triangular system of equations", "", []), + Routine(True, True, "2a", "gemv", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]), + Routine(True, True, "2a", "gbmv", T, [S,D,C,Z,H], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is banded instead.", [ald_kl_ku_one]), + Routine(True, True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian matrix instead.", [ald_n]), + Routine(True, True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian banded matrix instead.", [ald_k_one]), + Routine(True, True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2a", "symv", T, [S,D,H], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric instead.", [ald_n]), + Routine(True, True, "2a", "sbmv", T, [S,D,H], 
["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric and banded instead.", [ald_k_one]), + Routine(True, True, "2a", "spmv", T, [S,D,H], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2a", "trmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], ["n","n"], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular instead.", [ald_n]), + Routine(True, True, "2a", "tbmv", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], ["n","n"], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular and banded instead.", [ald_k_one]), + Routine(True, True, "2a", "tpmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], ["n","n"], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and repreented as _AP_.", []), + Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], ["n","n"], [], "", "Solves a triangular system of equations", "", []), + Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], ["n","n"], [], "", "Solves a banded triangular system of equations", "", [ald_k_one]), + Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], ["n","n"], [], "", "Solves a packed triangular system of equations", "", []), # Level 2: matrix update - 
Routine(True, True, "2b", "ger", T, [S,D,H], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 matrix update", "Performs the operation _A = alpha * x * y^T + A_, in which _x_ is an input vector, _y^T_ is the transpose of the input vector _y_, _A_ is the matrix to be updated, and _alpha_ is a scalar value.", [ald_m]), - Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex matrix update", "Same operation as xGER, but with complex data-types.", [ald_m]), - Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex conjugated matrix update", "Same operation as xGERU, but the update is done based on the complex conjugate of the input vectors.", [ald_m]), - Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Hermitian rank-1 matrix update", "Performs the operation _A = alpha * x * x^T + A_, in which x is an input vector, x^T is the transpose of this vector, _A_ is the triangular Hermetian matrix to be updated, and alpha is a scalar value.", [ald_n]), - Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Hermitian packed rank-1 matrix update", "Same operation as xHER, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Hermitian rank-2 matrix update", "Performs the operation _A = alpha * x * y^T + conj(alpha) * y * x^T + A_, in which _x_ is an input vector and _x^T_ its transpose, _y_ is an input vector and _y^T_ its transpose, _A_ is the triangular Hermetian matrix to be updated, _alpha_ is a scalar value and _conj(alpha)_ its complex conjugate.", [ald_n]), - Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", 
"Hermitian packed rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2b", "syr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Symmetric rank-1 matrix update", "Same operation as xHER, but matrix A is a symmetric matrix instead.", [ald_n]), - Routine(True, True, "2b", "spr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Symmetric packed rank-1 matrix update", "Same operation as xSPR, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2b", "syr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Symmetric rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is a symmetric matrix instead.", [ald_n]), - Routine(True, True, "2b", "spr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Symmetric packed rank-2 matrix update", "Same operation as xSPR2, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2b", "ger", T, [S,D,H], ["m","n"], ["layout"], ["x","y"], ["a"], ["n","n","n"], ["alpha"], "", "General rank-1 matrix update", "Performs the operation _A = alpha * x * y^T + A_, in which _x_ is an input vector, _y^T_ is the transpose of the input vector _y_, _A_ is the matrix to be updated, and _alpha_ is a scalar value.", [ald_m]), + Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["n","n","n"], ["alpha"], "", "General rank-1 complex matrix update", "Same operation as xGER, but with complex data-types.", [ald_m]), + Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["n","n","n"], ["alpha"], "", "General rank-1 complex conjugated matrix update", "Same operation as xGERU, but the update is done based on the complex conjugate of the input vectors.", [ald_m]), + 
Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["n","n"], ["alpha"], "", "Hermitian rank-1 matrix update", "Performs the operation _A = alpha * x * x^T + A_, in which x is an input vector, x^T is the transpose of this vector, _A_ is the triangular Hermetian matrix to be updated, and alpha is a scalar value.", [ald_n]), + Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["n","n"], ["alpha"], "", "Hermitian packed rank-1 matrix update", "Same operation as xHER, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["n","n","n"], ["alpha"], "", "Hermitian rank-2 matrix update", "Performs the operation _A = alpha * x * y^T + conj(alpha) * y * x^T + A_, in which _x_ is an input vector and _x^T_ its transpose, _y_ is an input vector and _y^T_ its transpose, _A_ is the triangular Hermetian matrix to be updated, _alpha_ is a scalar value and _conj(alpha)_ its complex conjugate.", [ald_n]), + Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["n","n","n"], ["alpha"], "", "Hermitian packed rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2b", "syr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["a"], ["n","n"], ["alpha"], "", "Symmetric rank-1 matrix update", "Same operation as xHER, but matrix A is a symmetric matrix instead.", [ald_n]), + Routine(True, True, "2b", "spr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["ap"], ["n","n"], ["alpha"], "", "Symmetric packed rank-1 matrix update", "Same operation as xSPR, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2b", "syr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["n","n","n"], 
["alpha"], "", "Symmetric rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is a symmetric matrix instead.", [ald_n]), + Routine(True, True, "2b", "spr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["n","n","n"], ["alpha"], "", "Symmetric packed rank-2 matrix update", "Same operation as xSPR2, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), ], [ # Level 3: matrix-matrix - Routine(True, True, "3", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "General matrix-matrix multiplication", "Performs the matrix product _C = alpha * A * B + beta * C_, in which _A_ (_m_ by _k_) and _B_ (_k_ by _n_) are two general rectangular input matrices, _C_ (_m_ by _n_) is the matrix to be updated, and _alpha_ and _beta_ are scalar values. The matrices _A_ and/or _B_ can optionally be transposed before performing the operation.", [ald_transa_m_k, bld_transb_k_n, cld_m]), - Routine(True, True, "3", "symm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "Same operation as xGEMM, but _A_ is symmetric instead. In case of `side == kLeft`, _A_ is a symmetric _m_ by _m_ matrix and _C = alpha * A * B + beta * C_ is performed. 
Otherwise, in case of `side == kRight`, _A_ is a symmtric _n_ by _n_ matrix and _C = alpha * B * A + beta * C_ is performed.", [ald_side_m_n, bld_m, cld_m]), - Routine(True, True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "Same operation as xSYMM, but _A_ is an Hermitian matrix instead.", [ald_side_m_n, bld_m, cld_m]), - Routine(True, True, "3", "syrk", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * A^T + beta * C_ or _C = alpha * A^T * A + beta * C_, in which _A_ is a general matrix and _A^T_ is its transpose, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, cld_m]), - Routine(True, True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "Same operation as xSYRK, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, cld_m]), - Routine(True, True, "3", "syr2k", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, bld_trans_n_k, cld_n]), - Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "Same operation as xSYR2K, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]), - 
Routine(True, True, "3", "trmm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]), - Routine(False, True, "3", "trsm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Solves a triangular system of equations", "", []), + Routine(True, True, "3", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["n","n","n"], ["alpha","beta"], "", "General matrix-matrix multiplication", "Performs the matrix product _C = alpha * A * B + beta * C_, in which _A_ (_m_ by _k_) and _B_ (_k_ by _n_) are two general rectangular input matrices, _C_ (_m_ by _n_) is the matrix to be updated, and _alpha_ and _beta_ are scalar values. The matrices _A_ and/or _B_ can optionally be transposed before performing the operation.", [ald_transa_m_k, bld_transb_k_n, cld_m]), + Routine(True, True, "3", "symm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["n","n","n"], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "Same operation as xGEMM, but _A_ is symmetric instead. In case of `side == kLeft`, _A_ is a symmetric _m_ by _m_ matrix and _C = alpha * A * B + beta * C_ is performed. 
Otherwise, in case of `side == kRight`, _A_ is a symmetric _n_ by _n_ matrix and _C = alpha * B * A + beta * C_ is performed.", [ald_side_m_n, bld_m, cld_m]), + Routine(True, True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["n","n","n"], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "Same operation as xSYMM, but _A_ is an Hermitian matrix instead.", [ald_side_m_n, bld_m, cld_m]), + Routine(True, True, "3", "syrk", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["n","n"], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * A^T + beta * C_ or _C = alpha * A^T * A + beta * C_, in which _A_ is a general matrix and _A^T_ is its transpose, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, cld_m]), + Routine(True, True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["n","n"], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "Same operation as xSYRK, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, cld_m]), + Routine(True, True, "3", "syr2k", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["n","n","n"], ["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, bld_trans_n_k, cld_n]), + Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["n","n","n"], ["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "Same operation as xSYR2K, but _C_ is an Hermitian
matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]), + Routine(True, True, "3", "trmm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["n","n"], ["alpha"], "", "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]), + Routine(False, True, "3", "trsm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["n","n"], ["alpha"], "", "Solves a triangular system of equations", "", []), ], [ # Level X: extra routines (not part of BLAS) - Routine(True, True, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]), + Routine(True, True, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], ["n","n"], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. 
The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]), ]] diff --git a/scripts/generator/generator/cpp.py b/scripts/generator/generator/cpp.py index 61730fdb..23a2207c 100644 --- a/scripts/generator/generator/cpp.py +++ b/scripts/generator/generator/cpp.py @@ -99,7 +99,8 @@ def clblast_blas_h(routine): """The Netlib CBLAS API header (.h)""" result = NL + "// " + routine.description + ": " + routine.short_names() + NL for flavour in routine.flavours: - result += routine.routine_header_netlib(flavour, 24, " PUBLIC_API") + ";" + NL + if flavour.precision_name in ["S", "D", "C", "Z"]: + result += routine.routine_header_netlib(flavour, 24, " PUBLIC_API") + ";" + NL return result @@ -107,31 +108,44 @@ def clblast_blas_cc(routine): """The Netlib CBLAS API implementation (.cpp)""" result = NL + "// " + routine.name.upper() + NL for flavour in routine.flavours: - template = "<" + flavour.template + ">" if routine.no_scalars() else "" - indent = " " * (26 + routine.length() + len(template)) - result += routine.routine_header_netlib(flavour, 13, "") + " {" + NL - - # Initialize OpenCL - result += " auto platform = Platform(size_t{0});" + NL - result += " auto device = Device(platform, size_t{0});" + NL - result += " auto context = Context(device);" + NL - result += " auto queue = Queue(context, device);" + NL - - # Copy data structures to the device - for name in routine.inputs + routine.outputs: - result += " " + routine.create_buffer(name, flavour.template, "0") + NL - for name in routine.inputs + routine.outputs: - result += " " + routine.write_buffer(name, "0") + NL - - # The function call - result += " auto status = clblast::" + routine.name.capitalize() + template + "(" - result += ("," + NL + indent).join([a for a in routine.arguments_cast(flavour, indent)]) - result += "," + NL + indent + "queue, event);" + NL - # Copy back and clean-up - for name in routine.outputs: - result += " " + routine.read_buffer(name, 
"0") + NL - result += " return;" + NL + "}" + NL + # There is a version available in CBLAS + if flavour.precision_name in ["S", "D", "C", "Z"]: + template = "<" + flavour.template + ">" if routine.no_scalars() else "" + indent = " " * (12 + routine.length() + len(template)) + result += routine.routine_header_netlib(flavour, 13, "") + " {" + NL + + # Initialize OpenCL + result += " auto device = get_device();" + NL + result += " auto context = Context(device);" + NL + result += " auto queue = Queue(context, device);" + NL + + # Set alpha and beta + result += "".join(" " + s + NL for s in routine.scalar_create_cpp(flavour)) + + # Copy data structures to the device + for i, name in enumerate(routine.inputs + routine.outputs): + result += " " + routine.set_size(name, routine.buffer_sizes[i]) + NL + result += " " + routine.create_buffer(name, flavour.buffer_type) + NL + for name in routine.inputs + routine.outputs: + prefix = "" if name in routine.outputs else "const " + result += " " + routine.write_buffer(name, prefix + flavour.buffer_type) + NL + + # The function call + result += " auto queue_cl = queue();" + NL + result += " auto s = " + routine.name.capitalize() + template + "(" + result += ("," + NL + indent).join([a for a in routine.arguments_netlib(flavour, indent)]) + result += "," + NL + indent + "&queue_cl);" + NL + + # Error handling + result += " if (s != StatusCode::kSuccess) {" + NL + result += " throw std::runtime_error(\"CLBlast returned with error code \" + ToString(s));" + NL + result += " }" + NL + + # Copy back and clean-up + for name in routine.outputs: + result += " " + routine.read_buffer(name, flavour.buffer_type) + NL + result += "}" + NL return result diff --git a/scripts/generator/generator/datatype.py b/scripts/generator/generator/datatype.py index 01f32dd8..98874174 100644 --- a/scripts/generator/generator/datatype.py +++ b/scripts/generator/generator/datatype.py @@ -54,6 +54,22 @@ class DataType: return self.beta_cl + "{{beta.real(), 
beta.imag()}}" return "beta" + def use_alpha_clblast(self): + """Transforms a Netlib CBLAS parameter to CLBlast style""" + if self.alpha_cpp == D_FLOAT2: + return self.alpha_cpp + "{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}" + elif self.alpha_cpp == D_DOUBLE2: + return self.alpha_cpp + "{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}" + return "alpha" + + def use_beta_clblast(self): + """As above, but for beta instead of alpha""" + if self.beta_cpp == D_FLOAT2: + return self.beta_cpp + "{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}" + elif self.beta_cpp == D_DOUBLE2: + return self.beta_cpp + "{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}" + return "beta" + def test_template(self): """Returns the template as used in the correctness/performance tests""" if self.buffer_type != self.beta_cpp: diff --git a/scripts/generator/generator/routine.py b/scripts/generator/generator/routine.py index 795fc532..b988c91a 100644 --- a/scripts/generator/generator/routine.py +++ b/scripts/generator/generator/routine.py @@ -13,7 +13,8 @@ import generator.convert as convert class Routine: """Class holding routine-specific information (e.g. name, which arguments, which precisions)""" def __init__(self, implemented, has_tests, level, name, template, flavours, sizes, options, - inputs, outputs, scalars, scratch, description, details, requirements): + inputs, outputs, buffer_sizes, scalars, scratch, + description, details, requirements): self.implemented = implemented self.has_tests = has_tests self.level = level @@ -24,6 +25,7 @@ class Routine: self.options = options self.inputs = inputs self.outputs = outputs + self.buffer_sizes = buffer_sizes self.scalars = scalars self.scratch = scratch # Scratch buffer (e.g. 
for xDOT) self.description = description @@ -66,19 +68,26 @@ class Routine: return ["a", "b", "c", "ap"] @staticmethod - def create_buffer(name, template, size): + def set_size(name, size): + """Sets the size of a buffer""" + return "const auto " + name + "_size = " + size + ";" + + @staticmethod + def create_buffer(name, template): """Creates a new CLCudaAPI buffer""" - return "auto " + name + "_buffer = Buffer<" + template + ">(context, " + size + ");" + return "auto " + name + "_buffer = Buffer<" + template + ">(context, " + name + "_size);" @staticmethod - def write_buffer(name, size): + def write_buffer(name, template): """Writes to a CLCudaAPI buffer""" - return name + "_buffer.Write(queue, " + size + ", " + name + ");" + data_structure = "reinterpret_cast<" + template + "*>(" + name + ")" + return name + "_buffer.Write(queue, " + name + "_size, " + data_structure + ");" @staticmethod - def read_buffer(name, size): + def read_buffer(name, template): """Reads from a CLCudaAPI buffer""" - return name + "_buffer.Read(queue, " + size + ", " + name + ");" + data_structure = "reinterpret_cast<" + template + "*>(" + name + ")" + return name + "_buffer.Read(queue, " + name + "_size, " + data_structure + ");" def non_index_inputs(self): """Lists of input/output buffers not index (integer)""" @@ -148,6 +157,15 @@ class Routine: return [", ".join(a + b + c)] return [] + def buffer_zero_offset(self, name): + """As above, but with an offset value of zero""" + if name in self.inputs or name in self.outputs: + a = [name + "_buffer()"] + b = ["0"] + c = [name + "_" + self.postfix(name)] if (name not in self.buffers_without_ld_inc()) else [] + return [", ".join(a + b + c)] + return [] + def buffer_def(self, name): """As above but with data-types""" prefix = "const " if name in self.inputs else "" @@ -263,6 +281,12 @@ class Routine: return [name] return [] + def scalar_cpp(self, name): + """As above, but with _cpp as a suffix""" + if name in self.scalars: + return [name + 
"_cpp"] + return [] + def scalar_half_to_float(self, name): """As above, but converts from float to half""" if name in self.scalars: @@ -339,6 +363,16 @@ class Routine: return ["`const " + self.template.beta_cpp + " " + name + "`: Input scalar constant."] return [] + def scalar_create_cpp(self, flavour): + """Creates a C++ version of a scalar based on a void*""" + result = [] + for name in self.scalars: + if name == "alpha": + result.append("const auto alpha_cpp = " + flavour.use_alpha_clblast() + ";") + elif name == "beta": + result.append("const auto beta_cpp = " + flavour.use_beta_clblast() + ";") + return result + def sizes_list(self): """Retrieves a list of comma-separated sizes (m, n, k)""" if self.sizes: @@ -469,6 +503,17 @@ class Routine: list(chain(*[self.buffer(b) for b in self.scalar_buffers_second()])) + list(chain(*[self.scalar_use(s, flavour) for s in self.other_scalars()]))) + def arguments_netlib(self, flavour, indent): + """As above, but for the Netlib CBLAS API""" + return (self.options_cast(indent) + self.sizes_list() + + list(chain(*[self.buffer_zero_offset(b) for b in self.scalar_buffers_first()])) + + self.scalar_cpp("alpha") + + list(chain(*[self.buffer_zero_offset(b) for b in self.buffers_first()])) + + self.scalar_cpp("beta") + + list(chain(*[self.buffer_zero_offset(b) for b in self.buffers_second()])) + + list(chain(*[self.buffer_zero_offset(b) for b in self.scalar_buffers_second()])) + + list(chain(*[self.scalar(s) for s in self.other_scalars()]))) + def arguments_wrapper_clblas(self, flavour): """As above, but for the clBLAS wrapper""" return (self.options_list() + self.sizes_list() + diff --git a/src/clblast_blas.cpp b/src/clblast_blas.cpp new file mode 100644 index 00000000..286b1ba8 --- /dev/null +++ b/src/clblast_blas.cpp @@ -0,0 +1,4651 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. 
The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file contains the Netlib CBLAS implementations to the CLBlast BLAS routines, performing buffer +// copies automatically and running on the default OpenCL platform and device. For full control over +// performance, it is advised to use the regular clblast.h or clblast_c.h headers instead. +// +// ================================================================================================= + +#include + +#include "clblast_blas.h" +#include "clblast.h" +#include "utilities/utilities.hpp" + +namespace clblast { + +// ================================================================================================= + +// Helper function to get a default OpenCL platform and device +Device get_device() { + auto platform_id = ConvertArgument(std::getenv("CLBLAST_PLATFORM"), size_t{0}); + auto device_id = ConvertArgument(std::getenv("CLBLAST_DEVICE"), size_t{0}); + auto platform = Platform(platform_id); + return Device(platform, device_id); +} + +// ================================================================================================= +// BLAS level-1 (vector-vector) routines +// ================================================================================================= + +// ROTG +void cblas_srotg(float* sa, + float* sb, + float* sc, + float* ss) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto sa_size = 1; + auto sa_buffer = Buffer(context, sa_size); + const auto sb_size = 1; + auto sb_buffer = Buffer(context, sb_size); + const auto sc_size = 1; + auto sc_buffer = Buffer(context, sc_size); + const auto ss_size = 1; + auto ss_buffer = Buffer(context, ss_size); + sa_buffer.Write(queue, sa_size, reinterpret_cast(sa)); + sb_buffer.Write(queue, 
sb_size, reinterpret_cast(sb)); + sc_buffer.Write(queue, sc_size, reinterpret_cast(sc)); + ss_buffer.Write(queue, ss_size, reinterpret_cast(ss)); + auto queue_cl = queue(); + auto s = Rotg(sa_buffer(), 0, + sb_buffer(), 0, + sc_buffer(), 0, + ss_buffer(), 0, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + sa_buffer.Read(queue, sa_size, reinterpret_cast(sa)); + sb_buffer.Read(queue, sb_size, reinterpret_cast(sb)); + sc_buffer.Read(queue, sc_size, reinterpret_cast(sc)); + ss_buffer.Read(queue, ss_size, reinterpret_cast(ss)); +} +void cblas_drotg(double* sa, + double* sb, + double* sc, + double* ss) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto sa_size = 1; + auto sa_buffer = Buffer(context, sa_size); + const auto sb_size = 1; + auto sb_buffer = Buffer(context, sb_size); + const auto sc_size = 1; + auto sc_buffer = Buffer(context, sc_size); + const auto ss_size = 1; + auto ss_buffer = Buffer(context, ss_size); + sa_buffer.Write(queue, sa_size, reinterpret_cast(sa)); + sb_buffer.Write(queue, sb_size, reinterpret_cast(sb)); + sc_buffer.Write(queue, sc_size, reinterpret_cast(sc)); + ss_buffer.Write(queue, ss_size, reinterpret_cast(ss)); + auto queue_cl = queue(); + auto s = Rotg(sa_buffer(), 0, + sb_buffer(), 0, + sc_buffer(), 0, + ss_buffer(), 0, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + sa_buffer.Read(queue, sa_size, reinterpret_cast(sa)); + sb_buffer.Read(queue, sb_size, reinterpret_cast(sb)); + sc_buffer.Read(queue, sc_size, reinterpret_cast(sc)); + ss_buffer.Read(queue, ss_size, reinterpret_cast(ss)); +} + +// ROTMG +void cblas_srotmg(float* sd1, + float* sd2, + float* sx1, + const float* sy1, + float* sparam) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const 
auto sy1_size = 1; + auto sy1_buffer = Buffer(context, sy1_size); + const auto sd1_size = 1; + auto sd1_buffer = Buffer(context, sd1_size); + const auto sd2_size = 1; + auto sd2_buffer = Buffer(context, sd2_size); + const auto sx1_size = 1; + auto sx1_buffer = Buffer(context, sx1_size); + const auto sparam_size = 1; + auto sparam_buffer = Buffer(context, sparam_size); + sy1_buffer.Write(queue, sy1_size, reinterpret_cast(sy1)); + sd1_buffer.Write(queue, sd1_size, reinterpret_cast(sd1)); + sd2_buffer.Write(queue, sd2_size, reinterpret_cast(sd2)); + sx1_buffer.Write(queue, sx1_size, reinterpret_cast(sx1)); + sparam_buffer.Write(queue, sparam_size, reinterpret_cast(sparam)); + auto queue_cl = queue(); + auto s = Rotmg(sd1_buffer(), 0, + sd2_buffer(), 0, + sx1_buffer(), 0, + sy1_buffer(), 0, + sparam_buffer(), 0, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + sd1_buffer.Read(queue, sd1_size, reinterpret_cast(sd1)); + sd2_buffer.Read(queue, sd2_size, reinterpret_cast(sd2)); + sx1_buffer.Read(queue, sx1_size, reinterpret_cast(sx1)); + sparam_buffer.Read(queue, sparam_size, reinterpret_cast(sparam)); +} +void cblas_drotmg(double* sd1, + double* sd2, + double* sx1, + const double* sy1, + double* sparam) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto sy1_size = 1; + auto sy1_buffer = Buffer(context, sy1_size); + const auto sd1_size = 1; + auto sd1_buffer = Buffer(context, sd1_size); + const auto sd2_size = 1; + auto sd2_buffer = Buffer(context, sd2_size); + const auto sx1_size = 1; + auto sx1_buffer = Buffer(context, sx1_size); + const auto sparam_size = 1; + auto sparam_buffer = Buffer(context, sparam_size); + sy1_buffer.Write(queue, sy1_size, reinterpret_cast(sy1)); + sd1_buffer.Write(queue, sd1_size, reinterpret_cast(sd1)); + sd2_buffer.Write(queue, sd2_size, reinterpret_cast(sd2)); + sx1_buffer.Write(queue, 
sx1_size, reinterpret_cast(sx1)); + sparam_buffer.Write(queue, sparam_size, reinterpret_cast(sparam)); + auto queue_cl = queue(); + auto s = Rotmg(sd1_buffer(), 0, + sd2_buffer(), 0, + sx1_buffer(), 0, + sy1_buffer(), 0, + sparam_buffer(), 0, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + sd1_buffer.Read(queue, sd1_size, reinterpret_cast(sd1)); + sd2_buffer.Read(queue, sd2_size, reinterpret_cast(sd2)); + sx1_buffer.Read(queue, sx1_size, reinterpret_cast(sx1)); + sparam_buffer.Read(queue, sparam_size, reinterpret_cast(sparam)); +} + +// ROT +void cblas_srot(const int n, + float* x, const int x_inc, + float* y, const int y_inc, + const float cos, + const float sin) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Rot(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + cos, + sin, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_drot(const int n, + double* x, const int x_inc, + double* y, const int y_inc, + const double cos, + const double sin) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Rot(n, + x_buffer(), 0, 
x_inc, + y_buffer(), 0, y_inc, + cos, + sin, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} + +// ROTM +void cblas_srotm(const int n, + float* x, const int x_inc, + float* y, const int y_inc, + float* sparam) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto sparam_size = 1; + auto sparam_buffer = Buffer(context, sparam_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + sparam_buffer.Write(queue, sparam_size, reinterpret_cast(sparam)); + auto queue_cl = queue(); + auto s = Rotm(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + sparam_buffer(), 0, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); + y_buffer.Read(queue, y_size, reinterpret_cast(y)); + sparam_buffer.Read(queue, sparam_size, reinterpret_cast(sparam)); +} +void cblas_drotm(const int n, + double* x, const int x_inc, + double* y, const int y_inc, + double* sparam) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto sparam_size = 1; + auto sparam_buffer = Buffer(context, sparam_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + sparam_buffer.Write(queue, sparam_size, reinterpret_cast(sparam)); + auto queue_cl = queue(); + auto s = Rotm(n, + x_buffer(), 
0, x_inc, + y_buffer(), 0, y_inc, + sparam_buffer(), 0, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); + y_buffer.Read(queue, y_size, reinterpret_cast(y)); + sparam_buffer.Read(queue, sparam_size, reinterpret_cast(sparam)); +} + +// SWAP +void cblas_sswap(const int n, + float* x, const int x_inc, + float* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Swap(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_dswap(const int n, + double* x, const int x_inc, + double* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Swap(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_cswap(const int n, + void* x, const int x_inc, + void* y, const 
int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Swap(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_zswap(const int n, + void* x, const int x_inc, + void* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Swap(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} + +// SCAL +void cblas_sscal(const int n, + const float alpha, + float* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Scal(n, + alpha_cpp, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with 
error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_dscal(const int n, + const double alpha, + double* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Scal(n, + alpha_cpp, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_cscal(const int n, + const void* alpha, + void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Scal(n, + alpha_cpp, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_zscal(const int n, + const void* alpha, + void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Scal(n, + alpha_cpp, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + 
x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} + +// COPY +void cblas_scopy(const int n, + const float* x, const int x_inc, + float* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Copy(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_dcopy(const int n, + const double* x, const int x_inc, + double* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Copy(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_ccopy(const int n, + const void* x, const int x_inc, + void* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = 
Copy(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_zcopy(const int n, + const void* x, const int x_inc, + void* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Copy(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} + +// AXPY +void cblas_saxpy(const int n, + const float alpha, + const float* x, const int x_inc, + float* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Axpy(n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_daxpy(const int n, + const double alpha, + const double* x, const int x_inc, + double* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + 
const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Axpy(n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_caxpy(const int n, + const void* alpha, + const void* x, const int x_inc, + void* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Axpy(n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_zaxpy(const int n, + const void* alpha, + const void* x, const int x_inc, + void* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + 
auto s = Axpy(n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} + +// DOT +void cblas_sdot(const int n, + float* dot, + const float* x, const int x_inc, + const float* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto dot_size = 1; + auto dot_buffer = Buffer(context, dot_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); + auto queue_cl = queue(); + auto s = Dot(n, + dot_buffer(), 0, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); +} +void cblas_ddot(const int n, + double* dot, + const double* x, const int x_inc, + const double* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto dot_size = 1; + auto dot_buffer = Buffer(context, dot_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); + auto queue_cl = queue(); + auto s = Dot(n, + dot_buffer(), 0, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error 
code " + ToString(s)); + } + dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); +} + +// DOTU +void cblas_cdotu(const int n, + void* dot, + const void* x, const int x_inc, + const void* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto dot_size = 1; + auto dot_buffer = Buffer(context, dot_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); + auto queue_cl = queue(); + auto s = Dotu(n, + dot_buffer(), 0, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); +} +void cblas_zdotu(const int n, + void* dot, + const void* x, const int x_inc, + const void* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto dot_size = 1; + auto dot_buffer = Buffer(context, dot_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); + auto queue_cl = queue(); + auto s = Dotu(n, + dot_buffer(), 0, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); +} + +// DOTC +void cblas_cdotc(const int n, + void* dot, + const void* x, const int x_inc, + 
const void* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto dot_size = 1; + auto dot_buffer = Buffer(context, dot_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); + auto queue_cl = queue(); + auto s = Dotc(n, + dot_buffer(), 0, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); +} +void cblas_zdotc(const int n, + void* dot, + const void* x, const int x_inc, + const void* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto dot_size = 1; + auto dot_buffer = Buffer(context, dot_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); + auto queue_cl = queue(); + auto s = Dotc(n, + dot_buffer(), 0, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); +} + +// NRM2 +void cblas_snrm2(const int n, + float* nrm2, + const float* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const 
auto nrm2_size = 1; + auto nrm2_buffer = Buffer(context, nrm2_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast(nrm2)); + auto queue_cl = queue(); + auto s = Nrm2(n, + nrm2_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast(nrm2)); +} +void cblas_dnrm2(const int n, + double* nrm2, + const double* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto nrm2_size = 1; + auto nrm2_buffer = Buffer(context, nrm2_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast(nrm2)); + auto queue_cl = queue(); + auto s = Nrm2(n, + nrm2_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast(nrm2)); +} +void cblas_scnrm2(const int n, + void* nrm2, + const void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto nrm2_size = 1; + auto nrm2_buffer = Buffer(context, nrm2_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast(nrm2)); + auto queue_cl = queue(); + auto s = Nrm2(n, + nrm2_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast(nrm2)); +} +void cblas_dznrm2(const int n, + void* nrm2, + const 
void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto nrm2_size = 1; + auto nrm2_buffer = Buffer(context, nrm2_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast(nrm2)); + auto queue_cl = queue(); + auto s = Nrm2(n, + nrm2_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast(nrm2)); +} + +// ASUM +void cblas_sasum(const int n, + float* asum, + const float* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto asum_size = 1; + auto asum_buffer = Buffer(context, asum_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + asum_buffer.Write(queue, asum_size, reinterpret_cast(asum)); + auto queue_cl = queue(); + auto s = Asum(n, + asum_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + asum_buffer.Read(queue, asum_size, reinterpret_cast(asum)); +} +void cblas_dasum(const int n, + double* asum, + const double* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto asum_size = 1; + auto asum_buffer = Buffer(context, asum_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + asum_buffer.Write(queue, asum_size, reinterpret_cast(asum)); + auto queue_cl = queue(); + auto s = Asum(n, + asum_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != 
StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + asum_buffer.Read(queue, asum_size, reinterpret_cast(asum)); +} +void cblas_scasum(const int n, + void* asum, + const void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto asum_size = 1; + auto asum_buffer = Buffer(context, asum_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + asum_buffer.Write(queue, asum_size, reinterpret_cast(asum)); + auto queue_cl = queue(); + auto s = Asum(n, + asum_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + asum_buffer.Read(queue, asum_size, reinterpret_cast(asum)); +} +void cblas_dzasum(const int n, + void* asum, + const void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto asum_size = 1; + auto asum_buffer = Buffer(context, asum_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + asum_buffer.Write(queue, asum_size, reinterpret_cast(asum)); + auto queue_cl = queue(); + auto s = Asum(n, + asum_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + asum_buffer.Read(queue, asum_size, reinterpret_cast(asum)); +} + +// SUM +void cblas_ssum(const int n, + float* sum, + const float* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto sum_size = 1; + auto sum_buffer = Buffer(context, sum_size); + x_buffer.Write(queue, 
+// NOTE(review): This region is part of a git-diff hunk whose newlines were collapsed
+// during extraction; line breaks are restored here at the '+' markers with every token
+// unchanged. Template arguments (Buffer<T>, reinterpret_cast<T*>, static_cast<E>) appear
+// to have been stripped by the same extraction -- restore from the upstream commit rather
+// than hand-editing. Buffer sizes use plain 'n' and ignore x_inc (upstream TODO).
x_size, reinterpret_cast(x));
+  sum_buffer.Write(queue, sum_size, reinterpret_cast(sum));
+  auto queue_cl = queue();
+  auto s = Sum(n,
+               sum_buffer(), 0,
+               x_buffer(), 0, x_inc,
+               &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  sum_buffer.Read(queue, sum_size, reinterpret_cast(sum));
+}
+void cblas_dsum(const int n,
+                double* sum,
+                const double* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto x_size = n;
+  auto x_buffer = Buffer(context, x_size);
+  const auto sum_size = 1;
+  auto sum_buffer = Buffer(context, sum_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  sum_buffer.Write(queue, sum_size, reinterpret_cast(sum));
+  auto queue_cl = queue();
+  auto s = Sum(n,
+               sum_buffer(), 0,
+               x_buffer(), 0, x_inc,
+               &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  sum_buffer.Read(queue, sum_size, reinterpret_cast(sum));
+}
+void cblas_scsum(const int n,
+                 void* sum,
+                 const void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto x_size = n;
+  auto x_buffer = Buffer(context, x_size);
+  const auto sum_size = 1;
+  auto sum_buffer = Buffer(context, sum_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  sum_buffer.Write(queue, sum_size, reinterpret_cast(sum));
+  auto queue_cl = queue();
+  auto s = Sum(n,
+               sum_buffer(), 0,
+               x_buffer(), 0, x_inc,
+               &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  sum_buffer.Read(queue, sum_size, reinterpret_cast(sum));
+}
+void cblas_dzsum(const int n,
+                 void* sum,
+                 const void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto x_size = n;
+  auto x_buffer = Buffer(context, x_size);
+  const auto sum_size = 1;
+  auto sum_buffer = Buffer(context, sum_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  sum_buffer.Write(queue, sum_size, reinterpret_cast(sum));
+  auto queue_cl = queue();
+  auto s = Sum(n,
+               sum_buffer(), 0,
+               x_buffer(), 0, x_inc,
+               &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  sum_buffer.Read(queue, sum_size, reinterpret_cast(sum));
+}
+
+// AMAX
+void cblas_isamax(const int n,
+                  float* imax,
+                  const float* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto x_size = n;
+  auto x_buffer = Buffer(context, x_size);
+  const auto imax_size = 1;
+  auto imax_buffer = Buffer(context, imax_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  imax_buffer.Write(queue, imax_size, reinterpret_cast(imax));
+  auto queue_cl = queue();
+  auto s = Amax(n,
+                imax_buffer(), 0,
+                x_buffer(), 0, x_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  imax_buffer.Read(queue, imax_size, reinterpret_cast(imax));
+}
+void cblas_idamax(const int n,
+                  double* imax,
+                  const double* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto x_size = n;
+  auto x_buffer = Buffer(context, x_size);
+  const auto imax_size = 1;
+  auto imax_buffer = Buffer(context, imax_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  imax_buffer.Write(queue, imax_size, reinterpret_cast(imax));
+  auto queue_cl = queue();
+  auto s = Amax(n,
+                imax_buffer(), 0,
+                x_buffer(), 0, x_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  imax_buffer.Read(queue, imax_size, reinterpret_cast(imax));
+}
+void cblas_icamax(const int n,
+                  void* imax,
+                  const void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto x_size = n;
+  auto x_buffer = Buffer(context, x_size);
+  const auto imax_size = 1;
+  auto imax_buffer = Buffer(context, imax_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  imax_buffer.Write(queue, imax_size, reinterpret_cast(imax));
+  auto queue_cl = queue();
+  auto s = Amax(n,
+                imax_buffer(), 0,
+                x_buffer(), 0, x_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  imax_buffer.Read(queue, imax_size, reinterpret_cast(imax));
+}
+void cblas_izamax(const int n,
+                  void* imax,
+                  const void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto x_size = n;
+  auto x_buffer = Buffer(context, x_size);
+  const auto imax_size = 1;
+  auto imax_buffer = Buffer(context, imax_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  imax_buffer.Write(queue, imax_size, reinterpret_cast(imax));
+  auto queue_cl = queue();
+  auto s = Amax(n,
+                imax_buffer(), 0,
+                x_buffer(), 0, x_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  imax_buffer.Read(queue, imax_size, reinterpret_cast(imax));
+}
+
+// MAX
+void cblas_ismax(const int n,
+                 float* imax,
+                 const float* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto x_size = n;
+  auto x_buffer = Buffer(context, x_size);
+  const auto imax_size = 1;
+  auto imax_buffer = Buffer(context, imax_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  imax_buffer.Write(queue, imax_size, reinterpret_cast(imax));
+  auto queue_cl = queue();
+  auto s = Max(n,
+               imax_buffer(), 0,
+               x_buffer(), 0, x_inc,
+               &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  imax_buffer.Read(queue, imax_size, reinterpret_cast(imax));
+}
+void cblas_idmax(const int n,
+                 double* imax,
+                 const double* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto x_size = n;
+  auto x_buffer = Buffer(context, x_size);
+  const auto imax_size = 1;
+  auto imax_buffer = Buffer(context, imax_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  imax_buffer.Write(queue, imax_size, reinterpret_cast(imax));
+  auto queue_cl = queue();
+  auto s = Max(n,
+               imax_buffer(), 0,
+               x_buffer(), 0, x_inc,
+               &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  imax_buffer.Read(queue, imax_size, reinterpret_cast(imax));
+}
+void cblas_icmax(const int n,
+                 void* imax,
+                 const void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto x_size = n;
+  auto x_buffer = Buffer(context, x_size);
+  const auto imax_size = 1;
+  auto imax_buffer = Buffer(context, imax_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  imax_buffer.Write(queue, imax_size, reinterpret_cast(imax));
+  auto queue_cl = queue();
+  auto s = Max(n,
+               imax_buffer(), 0,
+               x_buffer(), 0, x_inc,
+               &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  imax_buffer.Read(queue, imax_size, reinterpret_cast(imax));
+}
+void cblas_izmax(const int n,
+                 void* imax,
+                 const void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto x_size = n;
+  auto x_buffer = Buffer(context, x_size);
+  const auto imax_size = 1;
+  auto imax_buffer = Buffer(context, imax_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  imax_buffer.Write(queue, imax_size, reinterpret_cast(imax));
+  auto queue_cl = queue();
+  auto s = Max(n,
+               imax_buffer(), 0,
+               x_buffer(), 0, x_inc,
+               &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  imax_buffer.Read(queue, imax_size, reinterpret_cast(imax));
+}
+
+// MIN
+void cblas_ismin(const int n,
+                 float* imin,
+                 const float* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto x_size = n;
+  auto x_buffer = Buffer(context, x_size);
+  const auto imin_size = 1;
+  auto imin_buffer = Buffer(context, imin_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  imin_buffer.Write(queue, imin_size, reinterpret_cast(imin));
+  auto queue_cl = queue();
+  auto s = Min(n,
+               imin_buffer(), 0,
+               x_buffer(), 0, x_inc,
+               &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  imin_buffer.Read(queue, imin_size, reinterpret_cast(imin));
+}
+void cblas_idmin(const int n,
+                 double* imin,
+                 const double* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto x_size = n;
+  auto x_buffer = Buffer(context, x_size);
+  const auto imin_size = 1;
+  auto imin_buffer = Buffer(context, imin_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  imin_buffer.Write(queue, imin_size, reinterpret_cast(imin));
+  auto queue_cl = queue();
+  auto s = Min(n,
+               imin_buffer(), 0,
+               x_buffer(), 0, x_inc,
+               &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  imin_buffer.Read(queue, imin_size, reinterpret_cast(imin));
+}
+void cblas_icmin(const int n,
+                 void* imin,
+                 const void* x, const int
+// NOTE(review): mangled diff hunk -- line breaks restored at the inline '+' markers,
+// tokens byte-identical. Template arguments are missing (extraction damage); a/x/y
+// buffer sizes are all plain 'n' (wrong for m x n GEMV/GBMV -- upstream TODO says
+// "Set correct buffer sizes"). Regenerate from the upstream commit; do not hand-edit.
x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto x_size = n;
+  auto x_buffer = Buffer(context, x_size);
+  const auto imin_size = 1;
+  auto imin_buffer = Buffer(context, imin_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  imin_buffer.Write(queue, imin_size, reinterpret_cast(imin));
+  auto queue_cl = queue();
+  auto s = Min(n,
+               imin_buffer(), 0,
+               x_buffer(), 0, x_inc,
+               &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  imin_buffer.Read(queue, imin_size, reinterpret_cast(imin));
+}
+void cblas_izmin(const int n,
+                 void* imin,
+                 const void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto x_size = n;
+  auto x_buffer = Buffer(context, x_size);
+  const auto imin_size = 1;
+  auto imin_buffer = Buffer(context, imin_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  imin_buffer.Write(queue, imin_size, reinterpret_cast(imin));
+  auto queue_cl = queue();
+  auto s = Min(n,
+               imin_buffer(), 0,
+               x_buffer(), 0, x_inc,
+               &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  imin_buffer.Read(queue, imin_size, reinterpret_cast(imin));
+}
+
+// =================================================================================================
+// BLAS level-2 (matrix-vector) routines
+// =================================================================================================
+
+// GEMV
+void cblas_sgemv(const Layout layout, const Transpose a_transpose,
+                 const int m, const int n,
+                 const float alpha,
+                 const float* a, const int a_ld,
+                 const float* x, const int x_inc,
+                 const float beta,
+                 float* y, const int y_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = alpha;
+  const auto beta_cpp = beta;
+  const auto a_size = n;
+  auto a_buffer = Buffer(context, a_size);
+  const auto x_size = n;
+  auto x_buffer = Buffer(context, x_size);
+  const auto y_size = n;
+  auto y_buffer = Buffer(context, y_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast(y));
+  auto queue_cl = queue();
+  auto s = Gemv(static_cast(layout),
+                static_cast(a_transpose),
+                m, n,
+                alpha_cpp,
+                a_buffer(), 0, a_ld,
+                x_buffer(), 0, x_inc,
+                beta_cpp,
+                y_buffer(), 0, y_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast(y));
+}
+void cblas_dgemv(const Layout layout, const Transpose a_transpose,
+                 const int m, const int n,
+                 const double alpha,
+                 const double* a, const int a_ld,
+                 const double* x, const int x_inc,
+                 const double beta,
+                 double* y, const int y_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = alpha;
+  const auto beta_cpp = beta;
+  const auto a_size = n;
+  auto a_buffer = Buffer(context, a_size);
+  const auto x_size = n;
+  auto x_buffer = Buffer(context, x_size);
+  const auto y_size = n;
+  auto y_buffer = Buffer(context, y_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast(y));
+  auto queue_cl = queue();
+  auto s = Gemv(static_cast(layout),
+                static_cast(a_transpose),
+                m, n,
+                alpha_cpp,
+                a_buffer(), 0, a_ld,
+                x_buffer(), 0, x_inc,
+                beta_cpp,
+                y_buffer(), 0, y_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast(y));
+}
+void cblas_cgemv(const Layout layout, const Transpose a_transpose,
+                 const int m, const int n,
+                 const void* alpha,
+                 const void* a, const int a_ld,
+                 const void* x, const int x_inc,
+                 const void* beta,
+                 void* y, const int y_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]};
+  const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]};
+  const auto a_size = n;
+  auto a_buffer = Buffer(context, a_size);
+  const auto x_size = n;
+  auto x_buffer = Buffer(context, x_size);
+  const auto y_size = n;
+  auto y_buffer = Buffer(context, y_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast(y));
+  auto queue_cl = queue();
+  auto s = Gemv(static_cast(layout),
+                static_cast(a_transpose),
+                m, n,
+                alpha_cpp,
+                a_buffer(), 0, a_ld,
+                x_buffer(), 0, x_inc,
+                beta_cpp,
+                y_buffer(), 0, y_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast(y));
+}
+void cblas_zgemv(const Layout layout, const Transpose a_transpose,
+                 const int m, const int n,
+                 const void* alpha,
+                 const void* a, const int a_ld,
+                 const void* x, const int x_inc,
+                 const void* beta,
+                 void* y, const int y_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]};
+  const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]};
+  const auto a_size = n;
+  auto a_buffer = Buffer(context, a_size);
+  const auto x_size = n;
+  auto x_buffer = Buffer(context, x_size);
+  const auto y_size = n;
+  auto y_buffer = Buffer(context, y_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast(y));
+  auto queue_cl = queue();
+  auto s = Gemv(static_cast(layout),
+                static_cast(a_transpose),
+                m, n,
+                alpha_cpp,
+                a_buffer(), 0, a_ld,
+                x_buffer(), 0, x_inc,
+                beta_cpp,
+                y_buffer(), 0, y_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast(y));
+}
+
+// GBMV
+void cblas_sgbmv(const Layout layout, const Transpose a_transpose,
+                 const int m, const int n, const int kl, const int ku,
+                 const float alpha,
+                 const float* a, const int a_ld,
+                 const float* x, const int x_inc,
+                 const float beta,
+                 float* y, const int y_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = alpha;
+  const auto beta_cpp = beta;
+  const auto a_size = n;
+  auto a_buffer = Buffer(context, a_size);
+  const auto x_size = n;
+  auto x_buffer = Buffer(context, x_size);
+  const auto y_size = n;
+  auto y_buffer = Buffer(context, y_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast(y));
+  auto queue_cl = queue();
+  auto s = Gbmv(static_cast(layout),
+                static_cast(a_transpose),
+                m, n, kl, ku,
+                alpha_cpp,
+                a_buffer(), 0, a_ld,
+                x_buffer(), 0, x_inc,
+                beta_cpp,
+                y_buffer(), 0, y_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast(y));
+}
+void cblas_dgbmv(const Layout layout, const Transpose a_transpose,
+                 const int m, const int n, const int kl, const int ku,
+                 const double alpha,
+                 const double* a, const int a_ld,
+                 const double* x, const int x_inc,
+                 const double beta,
+                 double* y, const int y_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = alpha;
+  const auto beta_cpp = beta;
+  const auto a_size = n;
+  auto a_buffer = Buffer(context, a_size);
+  const auto x_size = n;
+  auto x_buffer = Buffer(context, x_size);
+  const auto y_size = n;
+  auto y_buffer = Buffer(context, y_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast(y));
+  auto queue_cl = queue();
+  auto s = Gbmv(static_cast(layout),
+                static_cast(a_transpose),
+                m, n, kl, ku,
+                alpha_cpp,
+                a_buffer(), 0, a_ld,
+                x_buffer(), 0, x_inc,
+                beta_cpp,
+                y_buffer(), 0, y_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast(y));
+}
+void cblas_cgbmv(const Layout layout, const Transpose a_transpose,
+                 const int m, const int n, const int kl, const int ku,
+                 const void* alpha,
+                 const void* a, const int a_ld,
+                 const void* x, const int x_inc,
+                 const void* beta,
+                 void* y, const int y_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]};
+  const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]};
+  const auto a_size = n;
+  auto a_buffer = Buffer(context, a_size);
+  const auto x_size = n;
+  auto x_buffer = Buffer(context, x_size);
+  const auto y_size = n;
+  auto y_buffer = Buffer(context, y_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast(y));
+  auto queue_cl = queue();
+  auto s = Gbmv(static_cast(layout),
+                static_cast(a_transpose),
+                m, n, kl, ku,
+                alpha_cpp,
+                a_buffer(), 0, a_ld,
+                x_buffer(), 0, x_inc,
+                beta_cpp,
+                y_buffer(), 0, y_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast(y));
+}
+void cblas_zgbmv(const Layout layout, const Transpose a_transpose,
+                 const int m, const int n, const int kl, const int ku,
+                 const void* alpha,
+                 const void* a, const int a_ld,
+                 const void* x, const int x_inc,
+                 const void* beta,
+                 void* y, const int y_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]};
+  const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]};
+  const auto a_size = n;
+  auto a_buffer = Buffer(context, a_size);
+  const auto x_size = n;
+  auto x_buffer = Buffer(context, x_size);
+  const auto y_size = n;
+  auto y_buffer = Buffer(context, y_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast(y));
+  auto queue_cl = queue();
+  auto s = Gbmv(static_cast(layout),
+                static_cast(a_transpose),
+                m, n, kl, ku,
+                alpha_cpp,
+                a_buffer(), 0, a_ld,
+                x_buffer(), 0, x_inc,
+                beta_cpp,
+                y_buffer(), 0, y_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast(y));
+}
+
+// HEMV
+void cblas_chemv(const Layout layout, const Triangle triangle,
+                 const int n,
+                 const void* alpha,
+                 const void* a, const int a_ld,
+                 const void* x, const int x_inc,
+                 const void* beta,
+                 void* y, const int y_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]};
+  const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]};
+  const
+// NOTE(review): mangled diff hunk -- line breaks restored at the inline '+' markers,
+// tokens byte-identical; missing template arguments are extraction damage.
+// Restore this region from the upstream commit rather than hand-editing.
auto a_size = n;
+  auto a_buffer = Buffer(context, a_size);
+  const auto x_size = n;
+  auto x_buffer = Buffer(context, x_size);
+  const auto y_size = n;
+  auto y_buffer = Buffer(context, y_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast(y));
+  auto queue_cl = queue();
+  auto s = Hemv(static_cast(layout),
+                static_cast(triangle),
+                n,
+                alpha_cpp,
+                a_buffer(), 0, a_ld,
+                x_buffer(), 0, x_inc,
+                beta_cpp,
+                y_buffer(), 0, y_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast(y));
+}
+void cblas_zhemv(const Layout layout, const Triangle triangle,
+                 const int n,
+                 const void* alpha,
+                 const void* a, const int a_ld,
+                 const void* x, const int x_inc,
+                 const void* beta,
+                 void* y, const int y_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]};
+  const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]};
+  const auto a_size = n;
+  auto a_buffer = Buffer(context, a_size);
+  const auto x_size = n;
+  auto x_buffer = Buffer(context, x_size);
+  const auto y_size = n;
+  auto y_buffer = Buffer(context, y_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast(y));
+  auto queue_cl = queue();
+  auto s = Hemv(static_cast(layout),
+                static_cast(triangle),
+                n,
+                alpha_cpp,
+                a_buffer(), 0, a_ld,
+                x_buffer(), 0, x_inc,
+                beta_cpp,
+                y_buffer(), 0, y_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast(y));
+}
+
+// HBMV
+void cblas_chbmv(const Layout layout, const Triangle triangle,
+                 const int n, const int k,
+                 const void* alpha,
+                 const void* a, const int a_ld,
+                 const void* x, const int x_inc,
+                 const void* beta,
+                 void* y, const int y_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]};
+  const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]};
+  const auto a_size = n;
+  auto a_buffer = Buffer(context, a_size);
+  const auto x_size = n;
+  auto x_buffer = Buffer(context, x_size);
+  const auto y_size = n;
+  auto y_buffer = Buffer(context, y_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast(y));
+  auto queue_cl = queue();
+  auto s = Hbmv(static_cast(layout),
+                static_cast(triangle),
+                n, k,
+                alpha_cpp,
+                a_buffer(), 0, a_ld,
+                x_buffer(), 0, x_inc,
+                beta_cpp,
+                y_buffer(), 0, y_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast(y));
+}
+void cblas_zhbmv(const Layout layout, const Triangle triangle,
+                 const int n, const int k,
+                 const void* alpha,
+                 const void* a, const int a_ld,
+                 const void* x, const int x_inc,
+                 const void* beta,
+                 void* y, const int y_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]};
+  const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]};
+  const auto a_size = n;
+  auto a_buffer = Buffer(context, a_size);
+  const auto x_size = n;
+  auto x_buffer = Buffer(context, x_size);
+  const auto y_size = n;
+  auto y_buffer = Buffer(context, y_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast(y));
+  auto queue_cl = queue();
+  auto s = Hbmv(static_cast(layout),
+                static_cast(triangle),
+                n, k,
+                alpha_cpp,
+                a_buffer(), 0, a_ld,
+                x_buffer(), 0, x_inc,
+                beta_cpp,
+                y_buffer(), 0, y_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast(y));
+}
+
+// HPMV
+void cblas_chpmv(const Layout layout, const Triangle triangle,
+                 const int n,
+                 const void* alpha,
+                 const void* ap,
+                 const void* x, const int x_inc,
+                 const void* beta,
+                 void* y, const int y_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]};
+  const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]};
+  const auto ap_size = n;
+  auto ap_buffer = Buffer(context, ap_size);
+  const auto x_size = n;
+  auto x_buffer = Buffer(context, x_size);
+  const auto y_size = n;
+  auto y_buffer = Buffer(context, y_size);
+  ap_buffer.Write(queue, ap_size, reinterpret_cast(ap));
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast(y));
+  auto queue_cl = queue();
+  auto s = Hpmv(static_cast(layout),
+                static_cast(triangle),
+                n,
+                alpha_cpp,
+                ap_buffer(), 0,
+                x_buffer(), 0, x_inc,
+                beta_cpp,
+                y_buffer(), 0, y_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast(y));
+}
+void cblas_zhpmv(const Layout layout, const Triangle triangle,
+                 const int n,
+                 const void* alpha,
+                 const void* ap,
+                 const void* x, const int x_inc,
+                 const void* beta,
+                 void* y, const int y_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]};
+  const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]};
+  const auto ap_size = n;
+  auto ap_buffer = Buffer(context, ap_size);
+  const auto x_size = n;
+  auto x_buffer = Buffer(context, x_size);
+  const auto y_size = n;
+  auto y_buffer = Buffer(context, y_size);
+  ap_buffer.Write(queue, ap_size, reinterpret_cast(ap));
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast(y));
+  auto queue_cl = queue();
+  auto s = Hpmv(static_cast(layout),
+                static_cast(triangle),
+                n,
+                alpha_cpp,
+                ap_buffer(), 0,
+                x_buffer(), 0, x_inc,
+                beta_cpp,
+                y_buffer(), 0, y_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast(y));
+}
+
+// SYMV
+void cblas_ssymv(const Layout layout, const Triangle triangle,
+                 const int n,
+                 const float alpha,
+                 const float* a, const int a_ld,
+                 const float* x, const int x_inc,
+                 const float beta,
+                 float* y, const int y_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = alpha;
+  const auto beta_cpp = beta;
+  const auto a_size = n;
+  auto a_buffer = Buffer(context, a_size);
+  const auto x_size = n;
+  auto x_buffer = Buffer(context, x_size);
+  const auto y_size = n;
+  auto y_buffer = Buffer(context, y_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast(y));
+  auto queue_cl = queue();
+  auto s = Symv(static_cast(layout),
+                static_cast(triangle),
+                n,
+                alpha_cpp,
+                a_buffer(), 0, a_ld,
+                x_buffer(), 0, x_inc,
+                beta_cpp,
+                y_buffer(), 0, y_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw
+// NOTE(review): mangled diff hunk -- line breaks restored at the inline '+' markers,
+// tokens byte-identical; missing template arguments are extraction damage.
+// Restore this region from the upstream commit rather than hand-editing.
std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast(y));
+}
+void cblas_dsymv(const Layout layout, const Triangle triangle,
+                 const int n,
+                 const double alpha,
+                 const double* a, const int a_ld,
+                 const double* x, const int x_inc,
+                 const double beta,
+                 double* y, const int y_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = alpha;
+  const auto beta_cpp = beta;
+  const auto a_size = n;
+  auto a_buffer = Buffer(context, a_size);
+  const auto x_size = n;
+  auto x_buffer = Buffer(context, x_size);
+  const auto y_size = n;
+  auto y_buffer = Buffer(context, y_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast(y));
+  auto queue_cl = queue();
+  auto s = Symv(static_cast(layout),
+                static_cast(triangle),
+                n,
+                alpha_cpp,
+                a_buffer(), 0, a_ld,
+                x_buffer(), 0, x_inc,
+                beta_cpp,
+                y_buffer(), 0, y_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast(y));
+}
+
+// SBMV
+void cblas_ssbmv(const Layout layout, const Triangle triangle,
+                 const int n, const int k,
+                 const float alpha,
+                 const float* a, const int a_ld,
+                 const float* x, const int x_inc,
+                 const float beta,
+                 float* y, const int y_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = alpha;
+  const auto beta_cpp = beta;
+  const auto a_size = n;
+  auto a_buffer = Buffer(context, a_size);
+  const auto x_size = n;
+  auto x_buffer = Buffer(context, x_size);
+  const auto y_size = n;
+  auto y_buffer = Buffer(context, y_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast(y));
+  auto queue_cl = queue();
+  auto s = Sbmv(static_cast(layout),
+                static_cast(triangle),
+                n, k,
+                alpha_cpp,
+                a_buffer(), 0, a_ld,
+                x_buffer(), 0, x_inc,
+                beta_cpp,
+                y_buffer(), 0, y_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast(y));
+}
+void cblas_dsbmv(const Layout layout, const Triangle triangle,
+                 const int n, const int k,
+                 const double alpha,
+                 const double* a, const int a_ld,
+                 const double* x, const int x_inc,
+                 const double beta,
+                 double* y, const int y_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = alpha;
+  const auto beta_cpp = beta;
+  const auto a_size = n;
+  auto a_buffer = Buffer(context, a_size);
+  const auto x_size = n;
+  auto x_buffer = Buffer(context, x_size);
+  const auto y_size = n;
+  auto y_buffer = Buffer(context, y_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast(y));
+  auto queue_cl = queue();
+  auto s = Sbmv(static_cast(layout),
+                static_cast(triangle),
+                n, k,
+                alpha_cpp,
+                a_buffer(), 0, a_ld,
+                x_buffer(), 0, x_inc,
+                beta_cpp,
+                y_buffer(), 0, y_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast(y));
+}
+
+// SPMV
+void cblas_sspmv(const Layout layout, const Triangle triangle,
+                 const int n,
+                 const float alpha,
+                 const float* ap,
+                 const float* x, const int x_inc,
+                 const float beta,
+                 float* y, const int y_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = alpha;
+  const auto beta_cpp = beta;
+  const auto ap_size = n;
+  auto ap_buffer = Buffer(context, ap_size);
+  const auto x_size = n;
+  auto x_buffer = Buffer(context, x_size);
+  const auto y_size = n;
+  auto y_buffer = Buffer(context, y_size);
+  ap_buffer.Write(queue, ap_size, reinterpret_cast(ap));
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast(y));
+  auto queue_cl = queue();
+  auto s = Spmv(static_cast(layout),
+                static_cast(triangle),
+                n,
+                alpha_cpp,
+                ap_buffer(), 0,
+                x_buffer(), 0, x_inc,
+                beta_cpp,
+                y_buffer(), 0, y_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast(y));
+}
+void cblas_dspmv(const Layout layout, const Triangle triangle,
+                 const int n,
+                 const double alpha,
+                 const double* ap,
+                 const double* x, const int x_inc,
+                 const double beta,
+                 double* y, const int y_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = alpha;
+  const auto beta_cpp = beta;
+  const auto ap_size = n;
+  auto ap_buffer = Buffer(context, ap_size);
+  const auto x_size = n;
+  auto x_buffer = Buffer(context, x_size);
+  const auto y_size = n;
+  auto y_buffer = Buffer(context, y_size);
+  ap_buffer.Write(queue, ap_size, reinterpret_cast(ap));
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast(y));
+  auto queue_cl = queue();
+  auto s = Spmv(static_cast(layout),
+                static_cast(triangle),
+                n,
+                alpha_cpp,
+                ap_buffer(), 0,
+                x_buffer(), 0, x_inc,
+                beta_cpp,
+                y_buffer(), 0, y_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast(y));
+}
+
+// TRMV
+void cblas_strmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                 const int n,
+                 const float* a, const
+// NOTE(review): mangled diff hunk -- line breaks restored at the inline '+' markers,
+// tokens byte-identical; missing template arguments are extraction damage.
+// Restore this region from the upstream commit rather than hand-editing.
int a_ld,
+                 float* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto a_size = n;
+  auto a_buffer = Buffer(context, a_size);
+  const auto x_size = n;
+  auto x_buffer = Buffer(context, x_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  auto queue_cl = queue();
+  auto s = Trmv(static_cast(layout),
+                static_cast(triangle),
+                static_cast(a_transpose),
+                static_cast(diagonal),
+                n,
+                a_buffer(), 0, a_ld,
+                x_buffer(), 0, x_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  x_buffer.Read(queue, x_size, reinterpret_cast(x));
+}
+void cblas_dtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                 const int n,
+                 const double* a, const int a_ld,
+                 double* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto a_size = n;
+  auto a_buffer = Buffer(context, a_size);
+  const auto x_size = n;
+  auto x_buffer = Buffer(context, x_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  auto queue_cl = queue();
+  auto s = Trmv(static_cast(layout),
+                static_cast(triangle),
+                static_cast(a_transpose),
+                static_cast(diagonal),
+                n,
+                a_buffer(), 0, a_ld,
+                x_buffer(), 0, x_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  x_buffer.Read(queue, x_size, reinterpret_cast(x));
+}
+void cblas_ctrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                 const int n,
+                 const void* a, const int a_ld,
+                 void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto a_size = n;
+  auto a_buffer = Buffer(context, a_size);
+  const auto x_size = n;
+  auto x_buffer = Buffer(context, x_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  auto queue_cl = queue();
+  auto s = Trmv(static_cast(layout),
+                static_cast(triangle),
+                static_cast(a_transpose),
+                static_cast(diagonal),
+                n,
+                a_buffer(), 0, a_ld,
+                x_buffer(), 0, x_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  x_buffer.Read(queue, x_size, reinterpret_cast(x));
+}
+void cblas_ztrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                 const int n,
+                 const void* a, const int a_ld,
+                 void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto a_size = n;
+  auto a_buffer = Buffer(context, a_size);
+  const auto x_size = n;
+  auto x_buffer = Buffer(context, x_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  auto queue_cl = queue();
+  auto s = Trmv(static_cast(layout),
+                static_cast(triangle),
+                static_cast(a_transpose),
+                static_cast(diagonal),
+                n,
+                a_buffer(), 0, a_ld,
+                x_buffer(), 0, x_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  x_buffer.Read(queue, x_size, reinterpret_cast(x));
+}
+
+// TBMV
+void cblas_stbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                 const int n, const int k,
+                 const float* a, const int a_ld,
+                 float* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto a_size = n;
+  auto a_buffer = Buffer(context, a_size);
+  const auto x_size = n;
+  auto x_buffer = Buffer(context, x_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  auto queue_cl = queue();
+  auto s = Tbmv(static_cast(layout),
+                static_cast(triangle),
+                static_cast(a_transpose),
+                static_cast(diagonal),
+                n, k,
+                a_buffer(), 0, a_ld,
+                x_buffer(), 0, x_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  x_buffer.Read(queue, x_size, reinterpret_cast(x));
+}
+void cblas_dtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                 const int n, const int k,
+                 const double* a, const int a_ld,
+                 double* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto a_size = n;
+  auto a_buffer = Buffer(context, a_size);
+  const auto x_size = n;
+  auto x_buffer = Buffer(context, x_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  auto queue_cl = queue();
+  auto s = Tbmv(static_cast(layout),
+                static_cast(triangle),
+                static_cast(a_transpose),
+                static_cast(diagonal),
+                n, k,
+                a_buffer(), 0, a_ld,
+                x_buffer(), 0, x_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  x_buffer.Read(queue, x_size, reinterpret_cast(x));
+}
+void cblas_ctbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                 const int n, const int k,
+                 const void* a, const int a_ld,
+                 void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto a_size = n;
+  auto a_buffer = Buffer(context, a_size);
+  const auto x_size = n;
+  auto x_buffer = Buffer(context, x_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast(a));
+  x_buffer.Write(queue, x_size,
reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Tbmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_ztbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, const int k, + const void* a, const int a_ld, + void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Tbmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} + +// TPMV +void cblas_stpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const float* ap, + float* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Tpmv(static_cast(layout), + static_cast(triangle), + 
static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_dtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const double* ap, + double* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Tpmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_ctpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const void* ap, + void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Tpmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw 
std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_ztpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const void* ap, + void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Tpmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} + +// TRSV +void cblas_strsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const float* a, const int a_ld, + float* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Trsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void 
cblas_dtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const double* a, const int a_ld, + double* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Trsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_ctrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const void* a, const int a_ld, + void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Trsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_ztrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const void* a, 
const int a_ld, + void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Trsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} + +// TBSV +void cblas_stbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, const int k, + const float* a, const int a_ld, + float* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Tbsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_dtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, const int k, + const double* a, const int a_ld, + double* x, const int x_inc) { + auto device = get_device(); + auto context = 
Context(device); + auto queue = Queue(context, device); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Tbsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_ctbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, const int k, + const void* a, const int a_ld, + void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Tbsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_ztbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, const int k, + const void* a, const int a_ld, + void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto a_size = n; + auto a_buffer = Buffer(context, 
a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Tbsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} + +// TPSV +void cblas_stpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const float* ap, + float* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Tpsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_dtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const double* ap, + double* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + 
x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Tpsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_ctpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const void* ap, + void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Tpsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_ztpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const void* ap, + void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Tpsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + 
static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} + +// GER +void cblas_sger(const Layout layout, + const int m, const int n, + const float alpha, + const float* x, const int x_inc, + const float* y, const int y_inc, + float* a, const int a_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = Ger(static_cast(layout), + m, n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} +void cblas_dger(const Layout layout, + const int m, const int n, + const double alpha, + const double* x, const int x_inc, + const double* y, const int y_inc, + double* a, const int a_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto 
queue_cl = queue(); + auto s = Ger(static_cast(layout), + m, n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} + +// GERU +void cblas_cgeru(const Layout layout, + const int m, const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = Geru(static_cast(layout), + m, n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} +void cblas_zgeru(const Layout layout, + const int m, const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto a_size = n; + auto a_buffer = 
Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = Geru(static_cast(layout), + m, n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} + +// GERC +void cblas_cgerc(const Layout layout, + const int m, const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = Gerc(static_cast(layout), + m, n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} +void cblas_zgerc(const Layout layout, + const int m, const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], 
reinterpret_cast(alpha)[1]}; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = Gerc(static_cast(layout), + m, n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} + +// HER +void cblas_cher(const Layout layout, const Triangle triangle, + const int n, + const void* alpha, + const void* x, const int x_inc, + void* a, const int a_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = Her(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} +void cblas_zher(const Layout layout, const Triangle triangle, + const int n, + const void* alpha, + const void* x, const int x_inc, + void* a, const int a_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + 
const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = Her(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} + +// HPR +void cblas_chpr(const Layout layout, const Triangle triangle, + const int n, + const void* alpha, + const void* x, const int x_inc, + void* ap) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + auto queue_cl = queue(); + auto s = Hpr(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + ap_buffer(), 0, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); +} +void cblas_zhpr(const Layout layout, const Triangle triangle, + const int n, + const void* alpha, + const void* x, const int x_inc, + void* ap) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + auto queue_cl = queue(); + auto s = Hpr(static_cast(layout), + 
static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + ap_buffer(), 0, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); +} + +// HER2 +void cblas_cher2(const Layout layout, const Triangle triangle, + const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = Her2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} +void cblas_zher2(const Layout layout, const Triangle triangle, + const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); 
+ x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = Her2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} + +// HPR2 +void cblas_chpr2(const Layout layout, const Triangle triangle, + const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* ap) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + auto queue_cl = queue(); + auto s = Hpr2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + ap_buffer(), 0, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); +} +void cblas_zhpr2(const Layout layout, const Triangle triangle, + const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* ap) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], 
reinterpret_cast(alpha)[1]}; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + auto queue_cl = queue(); + auto s = Hpr2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + ap_buffer(), 0, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); +} + +// SYR +void cblas_ssyr(const Layout layout, const Triangle triangle, + const int n, + const float alpha, + const float* x, const int x_inc, + float* a, const int a_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = Syr(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} +void cblas_dsyr(const Layout layout, const Triangle triangle, + const int n, + const double alpha, + const double* x, const int x_inc, + double* a, const int a_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto 
x_buffer = Buffer(context, x_size); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = Syr(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} + +// SPR +void cblas_sspr(const Layout layout, const Triangle triangle, + const int n, + const float alpha, + const float* x, const int x_inc, + float* ap) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + auto queue_cl = queue(); + auto s = Spr(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + ap_buffer(), 0, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); +} +void cblas_dspr(const Layout layout, const Triangle triangle, + const int n, + const double alpha, + const double* x, const int x_inc, + double* ap) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + auto queue_cl = queue(); 
+ auto s = Spr(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + ap_buffer(), 0, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); +} + +// SYR2 +void cblas_ssyr2(const Layout layout, const Triangle triangle, + const int n, + const float alpha, + const float* x, const int x_inc, + const float* y, const int y_inc, + float* a, const int a_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = Syr2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} +void cblas_dsyr2(const Layout layout, const Triangle triangle, + const int n, + const double alpha, + const double* x, const int x_inc, + const double* y, const int y_inc, + double* a, const int a_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + 
y_buffer.Write(queue, y_size, reinterpret_cast(y)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = Syr2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} + +// SPR2 +void cblas_sspr2(const Layout layout, const Triangle triangle, + const int n, + const float alpha, + const float* x, const int x_inc, + const float* y, const int y_inc, + float* ap) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + auto queue_cl = queue(); + auto s = Spr2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + ap_buffer(), 0, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); +} +void cblas_dspr2(const Layout layout, const Triangle triangle, + const int n, + const double alpha, + const double* x, const int x_inc, + const double* y, const int y_inc, + double* ap) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); 
+ const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + auto queue_cl = queue(); + auto s = Spr2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + ap_buffer(), 0, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); +} + +// ================================================================================================= +// BLAS level-3 (matrix-matrix) routines +// ================================================================================================= + +// GEMM +void cblas_sgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const int m, const int n, const int k, + const float alpha, + const float* a, const int a_ld, + const float* b, const int b_ld, + const float beta, + float* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Gemm(static_cast(layout), + static_cast(a_transpose), + static_cast(b_transpose), + m, n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error 
code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_dgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const int m, const int n, const int k, + const double alpha, + const double* a, const int a_ld, + const double* b, const int b_ld, + const double beta, + double* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Gemm(static_cast(layout), + static_cast(a_transpose), + static_cast(b_transpose), + m, n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_cgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const int m, const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + 
const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Gemm(static_cast(layout), + static_cast(a_transpose), + static_cast(b_transpose), + m, n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_zgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const int m, const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Gemm(static_cast(layout), + static_cast(a_transpose), + static_cast(b_transpose), + m, n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} + +// SYMM +void 
cblas_ssymm(const Layout layout, const Side side, const Triangle triangle, + const int m, const int n, + const float alpha, + const float* a, const int a_ld, + const float* b, const int b_ld, + const float beta, + float* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Symm(static_cast(layout), + static_cast(side), + static_cast(triangle), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_dsymm(const Layout layout, const Side side, const Triangle triangle, + const int m, const int n, + const double alpha, + const double* a, const int a_ld, + const double* b, const int b_ld, + const double beta, + double* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = 
Symm(static_cast(layout), + static_cast(side), + static_cast(triangle), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_csymm(const Layout layout, const Side side, const Triangle triangle, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Symm(static_cast(layout), + static_cast(side), + static_cast(triangle), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_zsymm(const Layout layout, const Side side, const Triangle triangle, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); 
+ const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Symm(static_cast(layout), + static_cast(side), + static_cast(triangle), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} + +// HEMM +void cblas_chemm(const Layout layout, const Side side, const Triangle triangle, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Hemm(static_cast(layout), + static_cast(side), + static_cast(triangle), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + 
b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_zhemm(const Layout layout, const Side side, const Triangle triangle, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Hemm(static_cast(layout), + static_cast(side), + static_cast(triangle), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} + +// SYRK +void cblas_ssyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const int n, const int k, + const float alpha, + const float* a, const int a_ld, + const float beta, + float* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = n; + auto a_buffer = Buffer(context, 
a_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Syrk(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_dsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const int n, const int k, + const double alpha, + const double* a, const int a_ld, + const double beta, + double* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Syrk(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_csyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], 
reinterpret_cast<const float*>(alpha)[1]};
+  const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
+  const auto a_size = n;
+  auto a_buffer = Buffer<float2>(context, a_size);
+  const auto c_size = n;
+  auto c_buffer = Buffer<float2>(context, c_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+  c_buffer.Write(queue, c_size, reinterpret_cast<const float2*>(c));
+  auto queue_cl = queue();
+  auto s = Syrk(static_cast<clblast::Layout>(layout),
+                static_cast<clblast::Triangle>(triangle),
+                static_cast<clblast::Transpose>(a_transpose),
+                n, k,
+                alpha_cpp,
+                a_buffer(), 0, a_ld,
+                beta_cpp,
+                c_buffer(), 0, c_ld,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  c_buffer.Read(queue, c_size, reinterpret_cast<float2*>(c));
+}
+void cblas_zsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+                 const int n, const int k,
+                 const void* alpha,
+                 const void* a, const int a_ld,
+                 const void* beta,
+                 void* c, const int c_ld) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+  const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
+  const auto a_size = n;
+  auto a_buffer = Buffer<double2>(context, a_size);
+  const auto c_size = n;
+  auto c_buffer = Buffer<double2>(context, c_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+  c_buffer.Write(queue, c_size, reinterpret_cast<const double2*>(c));
+  auto queue_cl = queue();
+  auto s = Syrk(static_cast<clblast::Layout>(layout),
+                static_cast<clblast::Triangle>(triangle),
+                static_cast<clblast::Transpose>(a_transpose),
+                n, k,
+                alpha_cpp,
+                a_buffer(), 0, a_ld,
+                beta_cpp,
+                c_buffer(), 0, c_ld,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  c_buffer.Read(queue, c_size, reinterpret_cast<double2*>(c));
+}
+
+// HERK
+void cblas_cherk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+                 const int n, const int k,
+                 const void* alpha,
+                 const void* a, const int a_ld,
+                 const void* beta,
+                 void* c, const int c_ld) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = alpha;
+  const auto beta_cpp = beta;
+  const auto a_size = n;
+  auto a_buffer = Buffer<float2>(context, a_size);
+  const auto c_size = n;
+  auto c_buffer = Buffer<float2>(context, c_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+  c_buffer.Write(queue, c_size, reinterpret_cast<const float2*>(c));
+  auto queue_cl = queue();
+  auto s = Herk(static_cast<clblast::Layout>(layout),
+                static_cast<clblast::Triangle>(triangle),
+                static_cast<clblast::Transpose>(a_transpose),
+                n, k,
+                alpha_cpp,
+                a_buffer(), 0, a_ld,
+                beta_cpp,
+                c_buffer(), 0, c_ld,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  c_buffer.Read(queue, c_size, reinterpret_cast<float2*>(c));
+}
+void cblas_zherk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+                 const int n, const int k,
+                 const void* alpha,
+                 const void* a, const int a_ld,
+                 const void* beta,
+                 void* c, const int c_ld) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = alpha;
+  const auto beta_cpp = beta;
+  const auto a_size = n;
+  auto a_buffer = Buffer<double2>(context, a_size);
+  const auto c_size = n;
+  auto c_buffer = Buffer<double2>(context, c_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+  c_buffer.Write(queue, c_size, reinterpret_cast<const double2*>(c));
+  auto queue_cl = queue();
+  auto s = Herk(static_cast<clblast::Layout>(layout),
+                static_cast<clblast::Triangle>(triangle),
+                static_cast<clblast::Transpose>(a_transpose),
+                n, k,
+                alpha_cpp,
+                a_buffer(), 0, a_ld,
+                beta_cpp,
+                c_buffer(), 0, c_ld,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  c_buffer.Read(queue, c_size, reinterpret_cast<double2*>(c));
+}
+
+// SYR2K
+void cblas_ssyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+                  const int n, const int k,
+                  const float alpha,
+                  const float* a, const int a_ld,
+                  const float* b, const int b_ld,
+                  const float beta,
+                  float* c, const int c_ld) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = alpha;
+  const auto beta_cpp = beta;
+  const auto a_size = n;
+  auto a_buffer = Buffer<float>(context, a_size);
+  const auto b_size = n;
+  auto b_buffer = Buffer<float>(context, b_size);
+  const auto c_size = n;
+  auto c_buffer = Buffer<float>(context, c_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
+  b_buffer.Write(queue, b_size, reinterpret_cast<const float*>(b));
+  c_buffer.Write(queue, c_size, reinterpret_cast<const float*>(c));
+  auto queue_cl = queue();
+  auto s = Syr2k(static_cast<clblast::Layout>(layout),
+                 static_cast<clblast::Triangle>(triangle),
+                 static_cast<clblast::Transpose>(ab_transpose),
+                 n, k,
+                 alpha_cpp,
+                 a_buffer(), 0, a_ld,
+                 b_buffer(), 0, b_ld,
+                 beta_cpp,
+                 c_buffer(), 0, c_ld,
+                 &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  c_buffer.Read(queue, c_size, reinterpret_cast<float*>(c));
+}
+void cblas_dsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+                  const int n, const int k,
+                  const double alpha,
+                  const double* a, const int a_ld,
+                  const double* b, const int b_ld,
+                  const double beta,
+                  double* c, const int c_ld) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = alpha;
+  const auto beta_cpp = beta;
+  const auto a_size = n;
+  auto a_buffer = Buffer<double>(context, a_size);
+  const auto b_size = n;
+  auto b_buffer = Buffer<double>(context, b_size);
+  const auto c_size = n;
+  auto c_buffer = Buffer<double>(context, c_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
+  b_buffer.Write(queue, b_size, reinterpret_cast<const double*>(b));
+  c_buffer.Write(queue, c_size, reinterpret_cast<const double*>(c));
+  auto queue_cl = queue();
+  auto s = Syr2k(static_cast<clblast::Layout>(layout),
+                 static_cast<clblast::Triangle>(triangle),
+                 static_cast<clblast::Transpose>(ab_transpose),
+                 n, k,
+                 alpha_cpp,
+                 a_buffer(), 0, a_ld,
+                 b_buffer(), 0, b_ld,
+                 beta_cpp,
+                 c_buffer(), 0, c_ld,
+                 &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  c_buffer.Read(queue, c_size, reinterpret_cast<double*>(c));
+}
+void cblas_csyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+                  const int n, const int k,
+                  const void* alpha,
+                  const void* a, const int a_ld,
+                  const void* b, const int b_ld,
+                  const void* beta,
+                  void* c, const int c_ld) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+  const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
+  const auto a_size = n;
+  auto a_buffer = Buffer<float2>(context, a_size);
+  const auto b_size = n;
+  auto b_buffer = Buffer<float2>(context, b_size);
+  const auto c_size = n;
+  auto c_buffer = Buffer<float2>(context, c_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+  b_buffer.Write(queue, b_size, reinterpret_cast<const float2*>(b));
+  c_buffer.Write(queue, c_size, reinterpret_cast<const float2*>(c));
+  auto queue_cl = queue();
+  auto s = Syr2k(static_cast<clblast::Layout>(layout),
+                 static_cast<clblast::Triangle>(triangle),
+                 static_cast<clblast::Transpose>(ab_transpose),
+                 n, k,
+                 alpha_cpp,
+                 a_buffer(), 0, a_ld,
+                 b_buffer(), 0, b_ld,
+                 beta_cpp,
+                 c_buffer(), 0, c_ld,
+                 &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  c_buffer.Read(queue, c_size, reinterpret_cast<float2*>(c));
+}
+void cblas_zsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+                  const int n, const int k,
+                  const void* alpha,
+                  const void* a, const int a_ld,
+                  const void* b, const int b_ld,
+                  const void* beta,
+                  void* c, const int c_ld) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+  const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
+  const auto a_size = n;
+  auto a_buffer = Buffer<double2>(context, a_size);
+  const auto b_size = n;
+  auto b_buffer = Buffer<double2>(context, b_size);
+  const auto c_size = n;
+  auto c_buffer = Buffer<double2>(context, c_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+  b_buffer.Write(queue, b_size, reinterpret_cast<const double2*>(b));
+  c_buffer.Write(queue, c_size, reinterpret_cast<const double2*>(c));
+  auto queue_cl = queue();
+  auto s = Syr2k(static_cast<clblast::Layout>(layout),
+                 static_cast<clblast::Triangle>(triangle),
+                 static_cast<clblast::Transpose>(ab_transpose),
+                 n, k,
+                 alpha_cpp,
+                 a_buffer(), 0, a_ld,
+                 b_buffer(), 0, b_ld,
+                 beta_cpp,
+                 c_buffer(), 0, c_ld,
+                 &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  c_buffer.Read(queue, c_size, reinterpret_cast<double2*>(c));
+}
+
+// HER2K
+void cblas_cher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+                  const int n, const int k,
+                  const void* alpha,
+                  const void* a, const int a_ld,
+                  const void* b, const int b_ld,
+                  const void* beta,
+                  void* c, const int c_ld) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+  const auto beta_cpp = beta;
+  const auto a_size = n;
+  auto a_buffer = Buffer<float2>(context, a_size);
+  const auto b_size = n;
+  auto b_buffer = Buffer<float2>(context, b_size);
+  const auto c_size = n;
+  auto c_buffer = Buffer<float2>(context, c_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+  b_buffer.Write(queue, b_size, reinterpret_cast<const float2*>(b));
+  c_buffer.Write(queue, c_size, reinterpret_cast<const float2*>(c));
+  auto queue_cl = queue();
+  auto s = Her2k(static_cast<clblast::Layout>(layout),
+                 static_cast<clblast::Triangle>(triangle),
+                 static_cast<clblast::Transpose>(ab_transpose),
+                 n, k,
+                 alpha_cpp,
+                 a_buffer(), 0, a_ld,
+                 b_buffer(), 0, b_ld,
+                 beta_cpp,
+                 c_buffer(), 0, c_ld,
+                 &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  c_buffer.Read(queue, c_size, reinterpret_cast<float2*>(c));
+}
+void cblas_zher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+                  const int n, const int k,
+                  const void* alpha,
+                  const void* a, const int a_ld,
+                  const void* b, const int b_ld,
+                  const void* beta,
+                  void* c, const int c_ld) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+  const auto beta_cpp = beta;
+  const auto a_size = n;
+  auto a_buffer = Buffer<double2>(context, a_size);
+  const auto b_size = n;
+  auto b_buffer = Buffer<double2>(context, b_size);
+  const auto c_size = n;
+  auto c_buffer = Buffer<double2>(context, c_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+  b_buffer.Write(queue, b_size, reinterpret_cast<const double2*>(b));
+  c_buffer.Write(queue, c_size, reinterpret_cast<const double2*>(c));
+  auto queue_cl = queue();
+  auto s = Her2k(static_cast<clblast::Layout>(layout),
+                 static_cast<clblast::Triangle>(triangle),
+                 static_cast<clblast::Transpose>(ab_transpose),
+                 n, k,
+                 alpha_cpp,
+                 a_buffer(), 0, a_ld,
+                 b_buffer(), 0, b_ld,
+                 beta_cpp,
+                 c_buffer(), 0, c_ld,
+                 &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  c_buffer.Read(queue, c_size, reinterpret_cast<double2*>(c));
+}
+
+// TRMM
+void cblas_strmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                 const int m, const int n,
+                 const float alpha,
+                 const float* a, const int a_ld,
+                 float* b, const int b_ld) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = alpha;
+  const auto a_size = n;
+  auto a_buffer = Buffer<float>(context, a_size);
+  const auto b_size = n;
+  auto b_buffer = Buffer<float>(context, b_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
+  b_buffer.Write(queue, b_size, reinterpret_cast<const float*>(b));
+  auto queue_cl = queue();
+  auto s = Trmm(static_cast<clblast::Layout>(layout),
+                static_cast<clblast::Side>(side),
+                static_cast<clblast::Triangle>(triangle),
+                static_cast<clblast::Transpose>(a_transpose),
+                static_cast<clblast::Diagonal>(diagonal),
+                m, n,
+                alpha_cpp,
+                a_buffer(), 0, a_ld,
+                b_buffer(), 0, b_ld,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  b_buffer.Read(queue, b_size, reinterpret_cast<float*>(b));
+}
+void cblas_dtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                 const int m, const int n,
+                 const double alpha,
+                 const double* a, const int a_ld,
+                 double* b, const int b_ld) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = alpha;
+  const auto a_size = n;
+  auto a_buffer = Buffer<double>(context, a_size);
+  const auto b_size = n;
+  auto b_buffer = Buffer<double>(context, b_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
+  b_buffer.Write(queue, b_size, reinterpret_cast<const double*>(b));
+  auto queue_cl = queue();
+  auto s = Trmm(static_cast<clblast::Layout>(layout),
+                static_cast<clblast::Side>(side),
+                static_cast<clblast::Triangle>(triangle),
+                static_cast<clblast::Transpose>(a_transpose),
+                static_cast<clblast::Diagonal>(diagonal),
+                m, n,
+                alpha_cpp,
+                a_buffer(), 0, a_ld,
+                b_buffer(), 0, b_ld,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  b_buffer.Read(queue, b_size, reinterpret_cast<double*>(b));
+}
+void cblas_ctrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                 const int m, const int n,
+                 const void* alpha,
+                 const void* a, const int a_ld,
+                 void* b, const int b_ld) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+  const auto a_size = n;
+  auto a_buffer = Buffer<float2>(context, a_size);
+  const auto b_size = n;
+  auto b_buffer = Buffer<float2>(context, b_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+  b_buffer.Write(queue, b_size, reinterpret_cast<const float2*>(b));
+  auto queue_cl = queue();
+  auto s = Trmm(static_cast<clblast::Layout>(layout),
+                static_cast<clblast::Side>(side),
+                static_cast<clblast::Triangle>(triangle),
+                static_cast<clblast::Transpose>(a_transpose),
+                static_cast<clblast::Diagonal>(diagonal),
+                m, n,
+                alpha_cpp,
+                a_buffer(), 0, a_ld,
+                b_buffer(), 0, b_ld,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  b_buffer.Read(queue, b_size, reinterpret_cast<float2*>(b));
+}
+void cblas_ztrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                 const int m, const int n,
+                 const void* alpha,
+                 const void* a, const int a_ld,
+                 void* b, const int b_ld) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+  const auto a_size = n;
+  auto a_buffer = Buffer<double2>(context, a_size);
+  const auto b_size = n;
+  auto b_buffer = Buffer<double2>(context, b_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+  b_buffer.Write(queue, b_size, reinterpret_cast<const double2*>(b));
+  auto queue_cl = queue();
+  auto s = Trmm(static_cast<clblast::Layout>(layout),
+                static_cast<clblast::Side>(side),
+                static_cast<clblast::Triangle>(triangle),
+                static_cast<clblast::Transpose>(a_transpose),
+                static_cast<clblast::Diagonal>(diagonal),
+                m, n,
+                alpha_cpp,
+                a_buffer(), 0, a_ld,
+                b_buffer(), 0, b_ld,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  b_buffer.Read(queue, b_size, reinterpret_cast<double2*>(b));
+}
+
+// TRSM
+void cblas_strsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                 const int m, const int n,
+                 const float alpha,
+                 const float* a, const int a_ld,
+                 float* b, const int b_ld) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = alpha;
+  const auto a_size = n;
+  auto a_buffer = Buffer<float>(context, a_size);
+  const auto b_size = n;
+  auto b_buffer = Buffer<float>(context, b_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
+  b_buffer.Write(queue, b_size, reinterpret_cast<const float*>(b));
+  auto queue_cl = queue();
+  auto s = Trsm(static_cast<clblast::Layout>(layout),
+                static_cast<clblast::Side>(side),
+                static_cast<clblast::Triangle>(triangle),
+                static_cast<clblast::Transpose>(a_transpose),
+                static_cast<clblast::Diagonal>(diagonal),
+                m, n,
+                alpha_cpp,
+                a_buffer(), 0, a_ld,
+                b_buffer(), 0, b_ld,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  b_buffer.Read(queue, b_size, reinterpret_cast<float*>(b));
+}
+void cblas_dtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                 const int m, const int n,
+                 const double alpha,
+                 const double* a, const int a_ld,
+                 double* b, const int b_ld) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = alpha;
+  const auto a_size = n;
+  auto a_buffer = Buffer<double>(context, a_size);
+  const auto b_size = n;
+  auto b_buffer = Buffer<double>(context, b_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
+  b_buffer.Write(queue, b_size, reinterpret_cast<const double*>(b));
+  auto queue_cl = queue();
+  auto s = Trsm(static_cast<clblast::Layout>(layout),
+                static_cast<clblast::Side>(side),
+                static_cast<clblast::Triangle>(triangle),
+                static_cast<clblast::Transpose>(a_transpose),
+                static_cast<clblast::Diagonal>(diagonal),
+                m, n,
+                alpha_cpp,
+                a_buffer(), 0, a_ld,
+                b_buffer(), 0, b_ld,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  b_buffer.Read(queue, b_size, reinterpret_cast<double*>(b));
+}
+void cblas_ctrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                 const int m, const int n,
+                 const void* alpha,
+                 const void* a, const int a_ld,
+                 void* b, const int b_ld) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+  const auto a_size = n;
+  auto a_buffer = Buffer<float2>(context, a_size);
+  const auto b_size = n;
+  auto b_buffer = Buffer<float2>(context, b_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+  b_buffer.Write(queue, b_size, reinterpret_cast<const float2*>(b));
+  auto queue_cl = queue();
+  auto s = Trsm(static_cast<clblast::Layout>(layout),
+                static_cast<clblast::Side>(side),
+                static_cast<clblast::Triangle>(triangle),
+                static_cast<clblast::Transpose>(a_transpose),
+                static_cast<clblast::Diagonal>(diagonal),
+                m, n,
+                alpha_cpp,
+                a_buffer(), 0, a_ld,
+                b_buffer(), 0, b_ld,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  b_buffer.Read(queue, b_size, reinterpret_cast<float2*>(b));
+}
+void cblas_ztrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                 const int m, const int n,
+                 const void* alpha,
+                 const void* a, const int a_ld,
+                 void* b, const int b_ld) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+  const auto a_size = n;
+  auto a_buffer = Buffer<double2>(context, a_size);
+  const auto b_size = n;
+  auto b_buffer = Buffer<double2>(context, b_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+  b_buffer.Write(queue, b_size, reinterpret_cast<const double2*>(b));
+  auto queue_cl = queue();
+  auto s = Trsm(static_cast<clblast::Layout>(layout),
+                static_cast<clblast::Side>(side),
+                static_cast<clblast::Triangle>(triangle),
+                static_cast<clblast::Transpose>(a_transpose),
+                static_cast<clblast::Diagonal>(diagonal),
+                m, n,
+                alpha_cpp,
+                a_buffer(), 0, a_ld,
+                b_buffer(), 0, b_ld,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  b_buffer.Read(queue, b_size, reinterpret_cast<double2*>(b));
+}
+
+// =================================================================================================
+// Extra non-BLAS routines (level-X)
+// =================================================================================================
+
+// OMATCOPY
+void cblas_somatcopy(const Layout layout, const Transpose a_transpose,
+                     const int m, const int n,
+                     const float alpha,
+                     const float* a, const int a_ld,
+                     float* b, const int b_ld) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = alpha;
+  const auto a_size = n;
+  auto a_buffer = Buffer<float>(context, a_size);
+  const auto b_size = n;
+  auto b_buffer = Buffer<float>(context, b_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
+  b_buffer.Write(queue, b_size, reinterpret_cast<const float*>(b));
+  auto queue_cl = queue();
+  auto s = Omatcopy(static_cast<clblast::Layout>(layout),
+                    static_cast<clblast::Transpose>(a_transpose),
+                    m, n,
+                    alpha_cpp,
+                    a_buffer(), 0, a_ld,
+                    b_buffer(), 0, b_ld,
+                    &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  b_buffer.Read(queue, b_size, reinterpret_cast<float*>(b));
+}
+void cblas_domatcopy(const Layout layout, const Transpose a_transpose,
+                     const int m, const int n,
+                     const double alpha,
+                     const double* a, const int a_ld,
+                     double* b, const int b_ld) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = alpha;
+  const auto a_size = n;
+  auto a_buffer = Buffer<double>(context, a_size);
+  const auto b_size = n;
+  auto b_buffer = Buffer<double>(context, b_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
+  b_buffer.Write(queue, b_size, reinterpret_cast<const double*>(b));
+  auto queue_cl = queue();
+  auto s = Omatcopy(static_cast<clblast::Layout>(layout),
+                    static_cast<clblast::Transpose>(a_transpose),
+                    m, n,
+                    alpha_cpp,
+                    a_buffer(), 0, a_ld,
+                    b_buffer(), 0, b_ld,
+                    &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  b_buffer.Read(queue, b_size, reinterpret_cast<double*>(b));
+}
+void cblas_comatcopy(const Layout layout, const Transpose a_transpose,
+                     const int m, const int n,
+                     const void* alpha,
+                     const void* a, const int a_ld,
+                     void* b, const int b_ld) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+  const auto a_size = n;
+  auto a_buffer = Buffer<float2>(context, a_size);
+  const auto b_size = n;
+  auto b_buffer = Buffer<float2>(context, b_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+  b_buffer.Write(queue, b_size, reinterpret_cast<const float2*>(b));
+  auto queue_cl = queue();
+  auto s = Omatcopy(static_cast<clblast::Layout>(layout),
+                    static_cast<clblast::Transpose>(a_transpose),
+                    m, n,
+                    alpha_cpp,
+                    a_buffer(), 0, a_ld,
+                    b_buffer(), 0, b_ld,
+                    &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  b_buffer.Read(queue, b_size, reinterpret_cast<float2*>(b));
+}
+void cblas_zomatcopy(const Layout layout, const Transpose a_transpose,
+                     const int m, const int n,
+                     const void* alpha,
+                     const void* a, const int a_ld,
+                     void* b, const int b_ld) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+  const auto a_size = n;
+  auto a_buffer = Buffer<double2>(context, a_size);
+  const auto b_size = n;
+  auto b_buffer = Buffer<double2>(context, b_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+  b_buffer.Write(queue, b_size, reinterpret_cast<const double2*>(b));
+  auto queue_cl = queue();
+  auto s = Omatcopy(static_cast<clblast::Layout>(layout),
+                    static_cast<clblast::Transpose>(a_transpose),
+                    m, n,
+                    alpha_cpp,
+                    a_buffer(), 0, a_ld,
+                    b_buffer(), 0, b_ld,
+                    &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  b_buffer.Read(queue, b_size, reinterpret_cast<double2*>(b));
+}
+
+// =================================================================================================
+} // namespace clblast
--
cgit v1.2.3