summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--CMakeLists.txt2
-rw-r--r--include/clblast_blas.h158
-rw-r--r--include/clblast_c.h5
-rwxr-xr-xscripts/generator/generator.py108
-rw-r--r--scripts/generator/generator/cpp.py64
-rw-r--r--scripts/generator/generator/datatype.py16
-rw-r--r--scripts/generator/generator/routine.py59
-rw-r--r--src/clblast_blas.cpp4651
8 files changed, 4817 insertions, 246 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f5edbd75..d2034617 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -163,6 +163,7 @@ set(PRECISIONS 32 64 3232 6464 16)
# Gathers all source-files
set(SOURCES
+ src/clblast_blas.cpp
src/database/database.cpp
src/routines/common.cpp
src/utilities/clblast_exceptions.cpp
@@ -213,6 +214,7 @@ install(TARGETS clblast EXPORT CLBlast DESTINATION lib)
install(FILES include/clblast.h DESTINATION include)
install(FILES include/clblast_c.h DESTINATION include)
install(FILES include/clblast_half.h DESTINATION include)
+install(FILES include/clblast_blas.h DESTINATION include)
# Installs the config for find_package in dependent projects
install(EXPORT CLBlast DESTINATION lib/cmake/CLBLast FILE CLBlastConfig.cmake)
diff --git a/include/clblast_blas.h b/include/clblast_blas.h
index a5d0cc9c..b4db4192 100644
--- a/include/clblast_blas.h
+++ b/include/clblast_blas.h
@@ -18,8 +18,8 @@
// Exports library functions under Windows when building a DLL. See also:
// https://msdn.microsoft.com/en-us/library/a90k134d.aspx
-#ifdef _WIN32
- #ifdef COMPILING_DLL
+#if defined(_WIN32) && defined(CLBLAST_DLL)
+ #if defined(COMPILING_DLL)
#define PUBLIC_API __declspec(dllexport)
#else
#define PUBLIC_API __declspec(dllimport)
@@ -42,6 +42,7 @@ typedef enum Triangle_ { kUpper = 121, kLower = 122 } Triangle;
typedef enum Diagonal_ { kNonUnit = 131, kUnit = 132 } Diagonal;
typedef enum Side_ { kLeft = 141, kRight = 142 } Side;
+
// =================================================================================================
// BLAS level-1 (vector-vector) routines
// =================================================================================================
@@ -103,9 +104,6 @@ void PUBLIC_API cblas_cswap(const int n,
void PUBLIC_API cblas_zswap(const int n,
void* x, const int x_inc,
void* y, const int y_inc);
-void PUBLIC_API cblas_hswap(const int n,
- void* x, const int x_inc,
- void* y, const int y_inc);
// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL
void PUBLIC_API cblas_sscal(const int n,
@@ -120,9 +118,6 @@ void PUBLIC_API cblas_cscal(const int n,
void PUBLIC_API cblas_zscal(const int n,
const void* alpha,
void* x, const int x_inc);
-void PUBLIC_API cblas_hscal(const int n,
- const void* alpha,
- void* x, const int x_inc);
// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY
void PUBLIC_API cblas_scopy(const int n,
@@ -137,9 +132,6 @@ void PUBLIC_API cblas_ccopy(const int n,
void PUBLIC_API cblas_zcopy(const int n,
const void* x, const int x_inc,
void* y, const int y_inc);
-void PUBLIC_API cblas_hcopy(const int n,
- const void* x, const int x_inc,
- void* y, const int y_inc);
// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY
void PUBLIC_API cblas_saxpy(const int n,
@@ -158,10 +150,6 @@ void PUBLIC_API cblas_zaxpy(const int n,
const void* alpha,
const void* x, const int x_inc,
void* y, const int y_inc);
-void PUBLIC_API cblas_haxpy(const int n,
- const void* alpha,
- const void* x, const int x_inc,
- void* y, const int y_inc);
// Dot product of two vectors: SDOT/DDOT/HDOT
void PUBLIC_API cblas_sdot(const int n,
@@ -172,10 +160,6 @@ void PUBLIC_API cblas_ddot(const int n,
double* dot,
const double* x, const int x_inc,
const double* y, const int y_inc);
-void PUBLIC_API cblas_hdot(const int n,
- void* dot,
- const void* x, const int x_inc,
- const void* y, const int y_inc);
// Dot product of two complex vectors: CDOTU/ZDOTU
void PUBLIC_API cblas_cdotu(const int n,
@@ -210,9 +194,6 @@ void PUBLIC_API cblas_scnrm2(const int n,
void PUBLIC_API cblas_dznrm2(const int n,
void* nrm2,
const void* x, const int x_inc);
-void PUBLIC_API cblas_hnrm2(const int n,
- void* nrm2,
- const void* x, const int x_inc);
// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM
void PUBLIC_API cblas_sasum(const int n,
@@ -227,9 +208,6 @@ void PUBLIC_API cblas_scasum(const int n,
void PUBLIC_API cblas_dzasum(const int n,
void* asum,
const void* x, const int x_inc);
-void PUBLIC_API cblas_hasum(const int n,
- void* asum,
- const void* x, const int x_inc);
// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM
void PUBLIC_API cblas_ssum(const int n,
@@ -244,9 +222,6 @@ void PUBLIC_API cblas_scsum(const int n,
void PUBLIC_API cblas_dzsum(const int n,
void* sum,
const void* x, const int x_inc);
-void PUBLIC_API cblas_hsum(const int n,
- void* sum,
- const void* x, const int x_inc);
// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX
void PUBLIC_API cblas_isamax(const int n,
@@ -261,9 +236,6 @@ void PUBLIC_API cblas_icamax(const int n,
void PUBLIC_API cblas_izamax(const int n,
void* imax,
const void* x, const int x_inc);
-void PUBLIC_API cblas_ihamax(const int n,
- void* imax,
- const void* x, const int x_inc);
// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX
void PUBLIC_API cblas_ismax(const int n,
@@ -278,9 +250,6 @@ void PUBLIC_API cblas_icmax(const int n,
void PUBLIC_API cblas_izmax(const int n,
void* imax,
const void* x, const int x_inc);
-void PUBLIC_API cblas_ihmax(const int n,
- void* imax,
- const void* x, const int x_inc);
// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN
void PUBLIC_API cblas_ismin(const int n,
@@ -295,9 +264,6 @@ void PUBLIC_API cblas_icmin(const int n,
void PUBLIC_API cblas_izmin(const int n,
void* imin,
const void* x, const int x_inc);
-void PUBLIC_API cblas_ihmin(const int n,
- void* imin,
- const void* x, const int x_inc);
// =================================================================================================
// BLAS level-2 (matrix-vector) routines
@@ -332,13 +298,6 @@ void PUBLIC_API cblas_zgemv(const Layout layout, const Transpose a_transpose,
const void* x, const int x_inc,
const void* beta,
void* y, const int y_inc);
-void PUBLIC_API cblas_hgemv(const Layout layout, const Transpose a_transpose,
- const int m, const int n,
- const void* alpha,
- const void* a, const int a_ld,
- const void* x, const int x_inc,
- const void* beta,
- void* y, const int y_inc);
// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV
void PUBLIC_API cblas_sgbmv(const Layout layout, const Transpose a_transpose,
@@ -369,13 +328,6 @@ void PUBLIC_API cblas_zgbmv(const Layout layout, const Transpose a_transpose,
const void* x, const int x_inc,
const void* beta,
void* y, const int y_inc);
-void PUBLIC_API cblas_hgbmv(const Layout layout, const Transpose a_transpose,
- const int m, const int n, const int kl, const int ku,
- const void* alpha,
- const void* a, const int a_ld,
- const void* x, const int x_inc,
- const void* beta,
- void* y, const int y_inc);
// Hermitian matrix-vector multiplication: CHEMV/ZHEMV
void PUBLIC_API cblas_chemv(const Layout layout, const Triangle triangle,
@@ -440,13 +392,6 @@ void PUBLIC_API cblas_dsymv(const Layout layout, const Triangle triangle,
const double* x, const int x_inc,
const double beta,
double* y, const int y_inc);
-void PUBLIC_API cblas_hsymv(const Layout layout, const Triangle triangle,
- const int n,
- const void* alpha,
- const void* a, const int a_ld,
- const void* x, const int x_inc,
- const void* beta,
- void* y, const int y_inc);
// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV
void PUBLIC_API cblas_ssbmv(const Layout layout, const Triangle triangle,
@@ -463,13 +408,6 @@ void PUBLIC_API cblas_dsbmv(const Layout layout, const Triangle triangle,
const double* x, const int x_inc,
const double beta,
double* y, const int y_inc);
-void PUBLIC_API cblas_hsbmv(const Layout layout, const Triangle triangle,
- const int n, const int k,
- const void* alpha,
- const void* a, const int a_ld,
- const void* x, const int x_inc,
- const void* beta,
- void* y, const int y_inc);
// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV
void PUBLIC_API cblas_sspmv(const Layout layout, const Triangle triangle,
@@ -486,13 +424,6 @@ void PUBLIC_API cblas_dspmv(const Layout layout, const Triangle triangle,
const double* x, const int x_inc,
const double beta,
double* y, const int y_inc);
-void PUBLIC_API cblas_hspmv(const Layout layout, const Triangle triangle,
- const int n,
- const void* alpha,
- const void* ap,
- const void* x, const int x_inc,
- const void* beta,
- void* y, const int y_inc);
// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV
void PUBLIC_API cblas_strmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
@@ -511,10 +442,6 @@ void PUBLIC_API cblas_ztrmv(const Layout layout, const Triangle triangle, const
const int n,
const void* a, const int a_ld,
void* x, const int x_inc);
-void PUBLIC_API cblas_htrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const int n,
- const void* a, const int a_ld,
- void* x, const int x_inc);
// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV
void PUBLIC_API cblas_stbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
@@ -533,10 +460,6 @@ void PUBLIC_API cblas_ztbmv(const Layout layout, const Triangle triangle, const
const int n, const int k,
const void* a, const int a_ld,
void* x, const int x_inc);
-void PUBLIC_API cblas_htbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const int n, const int k,
- const void* a, const int a_ld,
- void* x, const int x_inc);
// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV
void PUBLIC_API cblas_stpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
@@ -555,10 +478,6 @@ void PUBLIC_API cblas_ztpmv(const Layout layout, const Triangle triangle, const
const int n,
const void* ap,
void* x, const int x_inc);
-void PUBLIC_API cblas_htpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const int n,
- const void* ap,
- void* x, const int x_inc);
// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV
void PUBLIC_API cblas_strsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
@@ -627,12 +546,6 @@ void PUBLIC_API cblas_dger(const Layout layout,
const double* x, const int x_inc,
const double* y, const int y_inc,
double* a, const int a_ld);
-void PUBLIC_API cblas_hger(const Layout layout,
- const int m, const int n,
- const void* alpha,
- const void* x, const int x_inc,
- const void* y, const int y_inc,
- void* a, const int a_ld);
// General rank-1 complex matrix update: CGERU/ZGERU
void PUBLIC_API cblas_cgeru(const Layout layout,
@@ -725,11 +638,6 @@ void PUBLIC_API cblas_dsyr(const Layout layout, const Triangle triangle,
const double alpha,
const double* x, const int x_inc,
double* a, const int a_ld);
-void PUBLIC_API cblas_hsyr(const Layout layout, const Triangle triangle,
- const int n,
- const void* alpha,
- const void* x, const int x_inc,
- void* a, const int a_ld);
// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR
void PUBLIC_API cblas_sspr(const Layout layout, const Triangle triangle,
@@ -742,11 +650,6 @@ void PUBLIC_API cblas_dspr(const Layout layout, const Triangle triangle,
const double alpha,
const double* x, const int x_inc,
double* ap);
-void PUBLIC_API cblas_hspr(const Layout layout, const Triangle triangle,
- const int n,
- const void* alpha,
- const void* x, const int x_inc,
- void* ap);
// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2
void PUBLIC_API cblas_ssyr2(const Layout layout, const Triangle triangle,
@@ -761,12 +664,6 @@ void PUBLIC_API cblas_dsyr2(const Layout layout, const Triangle triangle,
const double* x, const int x_inc,
const double* y, const int y_inc,
double* a, const int a_ld);
-void PUBLIC_API cblas_hsyr2(const Layout layout, const Triangle triangle,
- const int n,
- const void* alpha,
- const void* x, const int x_inc,
- const void* y, const int y_inc,
- void* a, const int a_ld);
// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2
void PUBLIC_API cblas_sspr2(const Layout layout, const Triangle triangle,
@@ -781,12 +678,6 @@ void PUBLIC_API cblas_dspr2(const Layout layout, const Triangle triangle,
const double* x, const int x_inc,
const double* y, const int y_inc,
double* ap);
-void PUBLIC_API cblas_hspr2(const Layout layout, const Triangle triangle,
- const int n,
- const void* alpha,
- const void* x, const int x_inc,
- const void* y, const int y_inc,
- void* ap);
// =================================================================================================
// BLAS level-3 (matrix-matrix) routines
@@ -821,13 +712,6 @@ void PUBLIC_API cblas_zgemm(const Layout layout, const Transpose a_transpose, co
const void* b, const int b_ld,
const void* beta,
void* c, const int c_ld);
-void PUBLIC_API cblas_hgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
- const int m, const int n, const int k,
- const void* alpha,
- const void* a, const int a_ld,
- const void* b, const int b_ld,
- const void* beta,
- void* c, const int c_ld);
// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM
void PUBLIC_API cblas_ssymm(const Layout layout, const Side side, const Triangle triangle,
@@ -858,13 +742,6 @@ void PUBLIC_API cblas_zsymm(const Layout layout, const Side side, const Triangle
const void* b, const int b_ld,
const void* beta,
void* c, const int c_ld);
-void PUBLIC_API cblas_hsymm(const Layout layout, const Side side, const Triangle triangle,
- const int m, const int n,
- const void* alpha,
- const void* a, const int a_ld,
- const void* b, const int b_ld,
- const void* beta,
- void* c, const int c_ld);
// Hermitian matrix-matrix multiplication: CHEMM/ZHEMM
void PUBLIC_API cblas_chemm(const Layout layout, const Side side, const Triangle triangle,
@@ -907,12 +784,6 @@ void PUBLIC_API cblas_zsyrk(const Layout layout, const Triangle triangle, const
const void* a, const int a_ld,
const void* beta,
void* c, const int c_ld);
-void PUBLIC_API cblas_hsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
- const int n, const int k,
- const void* alpha,
- const void* a, const int a_ld,
- const void* beta,
- void* c, const int c_ld);
// Rank-K update of a hermitian matrix: CHERK/ZHERK
void PUBLIC_API cblas_cherk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
@@ -957,13 +828,6 @@ void PUBLIC_API cblas_zsyr2k(const Layout layout, const Triangle triangle, const
const void* b, const int b_ld,
const void* beta,
void* c, const int c_ld);
-void PUBLIC_API cblas_hsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
- const int n, const int k,
- const void* alpha,
- const void* a, const int a_ld,
- const void* b, const int b_ld,
- const void* beta,
- void* c, const int c_ld);
// Rank-2K update of a hermitian matrix: CHER2K/ZHER2K
void PUBLIC_API cblas_cher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
@@ -1002,11 +866,6 @@ void PUBLIC_API cblas_ztrmm(const Layout layout, const Side side, const Triangle
const void* alpha,
const void* a, const int a_ld,
void* b, const int b_ld);
-void PUBLIC_API cblas_htrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const int m, const int n,
- const void* alpha,
- const void* a, const int a_ld,
- void* b, const int b_ld);
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM
void PUBLIC_API cblas_strsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
@@ -1029,11 +888,6 @@ void PUBLIC_API cblas_ztrsm(const Layout layout, const Side side, const Triangle
const void* alpha,
const void* a, const int a_ld,
void* b, const int b_ld);
-void PUBLIC_API cblas_htrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
- const int m, const int n,
- const void* alpha,
- const void* a, const int a_ld,
- void* b, const int b_ld);
// =================================================================================================
// Extra non-BLAS routines (level-X)
@@ -1060,12 +914,6 @@ void PUBLIC_API cblas_zomatcopy(const Layout layout, const Transpose a_transpose
const void* alpha,
const void* a, const int a_ld,
void* b, const int b_ld);
-void PUBLIC_API cblas_homatcopy(const Layout layout, const Transpose a_transpose,
- const int m, const int n,
- const void* alpha,
- const void* a, const int a_ld,
- void* b, const int b_ld);
- half* b, const size_t b_offset, const size_t b_ld);
// =================================================================================================
diff --git a/include/clblast_c.h b/include/clblast_c.h
index 81f093cd..72f50d83 100644
--- a/include/clblast_c.h
+++ b/include/clblast_c.h
@@ -117,11 +117,6 @@ typedef enum CLBlastDiagonal_ { CLBlastDiagonalNonUnit = 131,
CLBlastDiagonalUnit = 132 } CLBlastDiagonal;
typedef enum CLBlastSide_ { CLBlastSideLeft = 141, CLBlastSideRight = 142 } CLBlastSide;
-// Precision scoped enum (values in bits)
-typedef enum CLBlastPrecision_ { CLBlastPrecisionHalf = 16, CLBlastPrecisionSingle = 32,
- CLBlastPrecisionDouble = 64, CLBlastPrecisionComplexSingle = 3232,
- CLBlastPrecisionComplexDouble = 6464 } CLBlastPrecision;
-
// =================================================================================================
// BLAS level-1 (vector-vector) routines
// =================================================================================================
diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py
index 220b314d..4ba97ff8 100755
--- a/scripts/generator/generator.py
+++ b/scripts/generator/generator.py
@@ -41,8 +41,8 @@ FILES = [
"/include/clblast_blas.h",
"/src/clblast_blas.cpp",
]
-HEADER_LINES = [117, 73, 118, 22, 29, 41, 43, 1]
-FOOTER_LINES = [17, 80, 19, 18, 6, 6, 10, 1]
+HEADER_LINES = [117, 73, 118, 22, 29, 41, 44, 32]
+FOOTER_LINES = [17, 80, 19, 18, 6, 6, 9, 3]
# Different possibilities for requirements
ald_m = "The value of `a_ld` must be at least `m`."
@@ -64,65 +64,65 @@ cld_n = "The value of `c_ld` must be at least `n`."
# Populates a list of routines
ROUTINES = [
[ # Level 1: vector-vector
- Routine(False, True, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], [], "", "Generate givens plane rotation", "", []),
- Routine(False, True, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], [], "", "Generate modified givens plane rotation", "", []),
- Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["cos","sin"], "", "Apply givens plane rotation", "", []),
- Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [], "", "Apply modified givens plane rotation", "", []),
- Routine(True, True, "1", "swap", T, [S,D,C,Z,H], ["n"], [], [], ["x","y"], [], "", "Swap two vectors", "Interchanges _n_ elements of vectors _x_ and _y_.", []),
- Routine(True, True, "1", "scal", T, [S,D,C,Z,H], ["n"], [], [], ["x"], ["alpha"], "", "Vector scaling", "Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.", []),
- Routine(True, True, "1", "copy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [], "", "Vector copy", "Copies the contents of vector _x_ into vector _y_.", []),
- Routine(True, True, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant.", []),
- Routine(True, True, "1", "dot", T, [S,D,H], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two vectors", "Multiplies _n_ elements of the vectors _x_ and _y_ element-wise and accumulates the results. The sum is stored in the _dot_ buffer.", []),
- Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []),
- Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []),
- Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []),
- Routine(True, True, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []),
- Routine(True, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []),
- Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []),
- Routine(True, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []),
- Routine(True, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []),
+ Routine(False, True, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], ["1","1","1","1"], [], "", "Generate givens plane rotation", "", []),
+ Routine(False, True, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], ["1","1","1","1","1"], [], "", "Generate modified givens plane rotation", "", []),
+ Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["n","n"], ["cos","sin"], "", "Apply givens plane rotation", "", []),
+ Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], ["n","n","1"], [], "", "Apply modified givens plane rotation", "", []),
+ Routine(True, True, "1", "swap", T, [S,D,C,Z,H], ["n"], [], [], ["x","y"], ["n","n"], [], "", "Swap two vectors", "Interchanges _n_ elements of vectors _x_ and _y_.", []),
+ Routine(True, True, "1", "scal", T, [S,D,C,Z,H], ["n"], [], [], ["x"], ["n"], ["alpha"], "", "Vector scaling", "Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.", []),
+ Routine(True, True, "1", "copy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["n","n"], [], "", "Vector copy", "Copies the contents of vector _x_ into vector _y_.", []),
+ Routine(True, True, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["n","n"], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant.", []),
+ Routine(True, True, "1", "dot", T, [S,D,H], ["n"], [], ["x","y"], ["dot"], ["n","n","1"], [], "n", "Dot product of two vectors", "Multiplies _n_ elements of the vectors _x_ and _y_ element-wise and accumulates the results. The sum is stored in the _dot_ buffer.", []),
+ Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], ["n","n","1"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []),
+ Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], ["n","n","1"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []),
+ Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], ["n","1"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []),
+ Routine(True, True, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], ["n","1"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []),
+ Routine(True, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], ["n","1"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []),
+ Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], ["n","1"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []),
+ Routine(True, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], ["n","1"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []),
+ Routine(True, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], ["n","1"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []),
],
[ # Level 2: matrix-vector
- Routine(True, True, "2a", "gemv", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]),
- Routine(True, True, "2a", "gbmv", T, [S,D,C,Z,H], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is banded instead.", [ald_kl_ku_one]),
- Routine(True, True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian matrix instead.", [ald_n]),
- Routine(True, True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian banded matrix instead.", [ald_k_one]),
- Routine(True, True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
- Routine(True, True, "2a", "symv", T, [S,D,H], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric instead.", [ald_n]),
- Routine(True, True, "2a", "sbmv", T, [S,D,H], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric and banded instead.", [ald_k_one]),
- Routine(True, True, "2a", "spmv", T, [S,D,H], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
- Routine(True, True, "2a", "trmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular instead.", [ald_n]),
- Routine(True, True, "2a", "tbmv", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular and banded instead.", [ald_k_one]),
- Routine(True, True, "2a", "tpmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and repreented as _AP_.", []),
- Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a triangular system of equations", "", []),
- Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a banded triangular system of equations", "", [ald_k_one]),
- Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "", "Solves a packed triangular system of equations", "", []),
+ Routine(True, True, "2a", "gemv", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]),
+ Routine(True, True, "2a", "gbmv", T, [S,D,C,Z,H], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is banded instead.", [ald_kl_ku_one]),
+ Routine(True, True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian matrix instead.", [ald_n]),
+ Routine(True, True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian banded matrix instead.", [ald_k_one]),
+ Routine(True, True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
+ Routine(True, True, "2a", "symv", T, [S,D,H], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric instead.", [ald_n]),
+ Routine(True, True, "2a", "sbmv", T, [S,D,H], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric and banded instead.", [ald_k_one]),
+ Routine(True, True, "2a", "spmv", T, [S,D,H], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
+ Routine(True, True, "2a", "trmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], ["n","n"], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular instead.", [ald_n]),
+ Routine(True, True, "2a", "tbmv", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], ["n","n"], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular and banded instead.", [ald_k_one]),
+ Routine(True, True, "2a", "tpmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], ["n","n"], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and represented as _AP_.", []),
+ Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], ["n","n"], [], "", "Solves a triangular system of equations", "", []),
+ Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], ["n","n"], [], "", "Solves a banded triangular system of equations", "", [ald_k_one]),
+ Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], ["n","n"], [], "", "Solves a packed triangular system of equations", "", []),
# Level 2: matrix update
- Routine(True, True, "2b", "ger", T, [S,D,H], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 matrix update", "Performs the operation _A = alpha * x * y^T + A_, in which _x_ is an input vector, _y^T_ is the transpose of the input vector _y_, _A_ is the matrix to be updated, and _alpha_ is a scalar value.", [ald_m]),
- Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex matrix update", "Same operation as xGER, but with complex data-types.", [ald_m]),
- Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex conjugated matrix update", "Same operation as xGERU, but the update is done based on the complex conjugate of the input vectors.", [ald_m]),
- Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Hermitian rank-1 matrix update", "Performs the operation _A = alpha * x * x^T + A_, in which x is an input vector, x^T is the transpose of this vector, _A_ is the triangular Hermetian matrix to be updated, and alpha is a scalar value.", [ald_n]),
- Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Hermitian packed rank-1 matrix update", "Same operation as xHER, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
- Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Hermitian rank-2 matrix update", "Performs the operation _A = alpha * x * y^T + conj(alpha) * y * x^T + A_, in which _x_ is an input vector and _x^T_ its transpose, _y_ is an input vector and _y^T_ its transpose, _A_ is the triangular Hermetian matrix to be updated, _alpha_ is a scalar value and _conj(alpha)_ its complex conjugate.", [ald_n]),
- Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Hermitian packed rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
- Routine(True, True, "2b", "syr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Symmetric rank-1 matrix update", "Same operation as xHER, but matrix A is a symmetric matrix instead.", [ald_n]),
- Routine(True, True, "2b", "spr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Symmetric packed rank-1 matrix update", "Same operation as xSPR, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
- Routine(True, True, "2b", "syr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Symmetric rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is a symmetric matrix instead.", [ald_n]),
- Routine(True, True, "2b", "spr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Symmetric packed rank-2 matrix update", "Same operation as xSPR2, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
+ Routine(True, True, "2b", "ger", T, [S,D,H], ["m","n"], ["layout"], ["x","y"], ["a"], ["n","n","n"], ["alpha"], "", "General rank-1 matrix update", "Performs the operation _A = alpha * x * y^T + A_, in which _x_ is an input vector, _y^T_ is the transpose of the input vector _y_, _A_ is the matrix to be updated, and _alpha_ is a scalar value.", [ald_m]),
+ Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["n","n","n"], ["alpha"], "", "General rank-1 complex matrix update", "Same operation as xGER, but with complex data-types.", [ald_m]),
+ Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["n","n","n"], ["alpha"], "", "General rank-1 complex conjugated matrix update", "Same operation as xGERU, but the update is done based on the complex conjugate of the input vectors.", [ald_m]),
+ Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["n","n"], ["alpha"], "", "Hermitian rank-1 matrix update", "Performs the operation _A = alpha * x * x^T + A_, in which x is an input vector, x^T is the transpose of this vector, _A_ is the triangular Hermitian matrix to be updated, and alpha is a scalar value.", [ald_n]),
+ Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["n","n"], ["alpha"], "", "Hermitian packed rank-1 matrix update", "Same operation as xHER, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
+ Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["n","n","n"], ["alpha"], "", "Hermitian rank-2 matrix update", "Performs the operation _A = alpha * x * y^T + conj(alpha) * y * x^T + A_, in which _x_ is an input vector and _x^T_ its transpose, _y_ is an input vector and _y^T_ its transpose, _A_ is the triangular Hermitian matrix to be updated, _alpha_ is a scalar value and _conj(alpha)_ its complex conjugate.", [ald_n]),
+ Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["n","n","n"], ["alpha"], "", "Hermitian packed rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
+ Routine(True, True, "2b", "syr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["a"], ["n","n"], ["alpha"], "", "Symmetric rank-1 matrix update", "Same operation as xHER, but matrix A is a symmetric matrix instead.", [ald_n]),
+ Routine(True, True, "2b", "spr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["ap"], ["n","n"], ["alpha"], "", "Symmetric packed rank-1 matrix update", "Same operation as xSPR, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
+ Routine(True, True, "2b", "syr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["n","n","n"], ["alpha"], "", "Symmetric rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is a symmetric matrix instead.", [ald_n]),
+ Routine(True, True, "2b", "spr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["n","n","n"], ["alpha"], "", "Symmetric packed rank-2 matrix update", "Same operation as xSPR2, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
],
[ # Level 3: matrix-matrix
- Routine(True, True, "3", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "General matrix-matrix multiplication", "Performs the matrix product _C = alpha * A * B + beta * C_, in which _A_ (_m_ by _k_) and _B_ (_k_ by _n_) are two general rectangular input matrices, _C_ (_m_ by _n_) is the matrix to be updated, and _alpha_ and _beta_ are scalar values. The matrices _A_ and/or _B_ can optionally be transposed before performing the operation.", [ald_transa_m_k, bld_transb_k_n, cld_m]),
- Routine(True, True, "3", "symm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "Same operation as xGEMM, but _A_ is symmetric instead. In case of `side == kLeft`, _A_ is a symmetric _m_ by _m_ matrix and _C = alpha * A * B + beta * C_ is performed. Otherwise, in case of `side == kRight`, _A_ is a symmtric _n_ by _n_ matrix and _C = alpha * B * A + beta * C_ is performed.", [ald_side_m_n, bld_m, cld_m]),
- Routine(True, True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "Same operation as xSYMM, but _A_ is an Hermitian matrix instead.", [ald_side_m_n, bld_m, cld_m]),
- Routine(True, True, "3", "syrk", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * A^T + beta * C_ or _C = alpha * A^T * A + beta * C_, in which _A_ is a general matrix and _A^T_ is its transpose, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, cld_m]),
- Routine(True, True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "Same operation as xSYRK, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, cld_m]),
- Routine(True, True, "3", "syr2k", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
- Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "Same operation as xSYR2K, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
- Routine(True, True, "3", "trmm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]),
- Routine(False, True, "3", "trsm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Solves a triangular system of equations", "", []),
+ Routine(True, True, "3", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["n","n","n"], ["alpha","beta"], "", "General matrix-matrix multiplication", "Performs the matrix product _C = alpha * A * B + beta * C_, in which _A_ (_m_ by _k_) and _B_ (_k_ by _n_) are two general rectangular input matrices, _C_ (_m_ by _n_) is the matrix to be updated, and _alpha_ and _beta_ are scalar values. The matrices _A_ and/or _B_ can optionally be transposed before performing the operation.", [ald_transa_m_k, bld_transb_k_n, cld_m]),
+ Routine(True, True, "3", "symm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["n","n","n"], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "Same operation as xGEMM, but _A_ is symmetric instead. In case of `side == kLeft`, _A_ is a symmetric _m_ by _m_ matrix and _C = alpha * A * B + beta * C_ is performed. Otherwise, in case of `side == kRight`, _A_ is a symmetric _n_ by _n_ matrix and _C = alpha * B * A + beta * C_ is performed.", [ald_side_m_n, bld_m, cld_m]),
+ Routine(True, True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["n","n","n"], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "Same operation as xSYMM, but _A_ is an Hermitian matrix instead.", [ald_side_m_n, bld_m, cld_m]),
+ Routine(True, True, "3", "syrk", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["n","n"], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * A^T + beta * C_ or _C = alpha * A^T * A + beta * C_, in which _A_ is a general matrix and _A^T_ is its transpose, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, cld_m]),
+ Routine(True, True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["n","n"], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "Same operation as xSYRK, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, cld_m]),
+ Routine(True, True, "3", "syr2k", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["n","n","n"], ["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
+ Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["n","n","n"], ["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "Same operation as xSYR2K, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
+ Routine(True, True, "3", "trmm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["n","n"], ["alpha"], "", "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]),
+ Routine(False, True, "3", "trsm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["n","n"], ["alpha"], "", "Solves a triangular system of equations", "", []),
],
[ # Level X: extra routines (not part of BLAS)
- Routine(True, True, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]),
+ Routine(True, True, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], ["n","n"], ["alpha"], "", "Scaling and out-of-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]),
]]
diff --git a/scripts/generator/generator/cpp.py b/scripts/generator/generator/cpp.py
index 61730fdb..23a2207c 100644
--- a/scripts/generator/generator/cpp.py
+++ b/scripts/generator/generator/cpp.py
@@ -99,7 +99,8 @@ def clblast_blas_h(routine):
"""The Netlib CBLAS API header (.h)"""
result = NL + "// " + routine.description + ": " + routine.short_names() + NL
for flavour in routine.flavours:
- result += routine.routine_header_netlib(flavour, 24, " PUBLIC_API") + ";" + NL
+ if flavour.precision_name in ["S", "D", "C", "Z"]:
+ result += routine.routine_header_netlib(flavour, 24, " PUBLIC_API") + ";" + NL
return result
@@ -107,31 +108,44 @@ def clblast_blas_cc(routine):
"""The Netlib CBLAS API implementation (.cpp)"""
result = NL + "// " + routine.name.upper() + NL
for flavour in routine.flavours:
- template = "<" + flavour.template + ">" if routine.no_scalars() else ""
- indent = " " * (26 + routine.length() + len(template))
- result += routine.routine_header_netlib(flavour, 13, "") + " {" + NL
-
- # Initialize OpenCL
- result += " auto platform = Platform(size_t{0});" + NL
- result += " auto device = Device(platform, size_t{0});" + NL
- result += " auto context = Context(device);" + NL
- result += " auto queue = Queue(context, device);" + NL
-
- # Copy data structures to the device
- for name in routine.inputs + routine.outputs:
- result += " " + routine.create_buffer(name, flavour.template, "0") + NL
- for name in routine.inputs + routine.outputs:
- result += " " + routine.write_buffer(name, "0") + NL
-
- # The function call
- result += " auto status = clblast::" + routine.name.capitalize() + template + "("
- result += ("," + NL + indent).join([a for a in routine.arguments_cast(flavour, indent)])
- result += "," + NL + indent + "queue, event);" + NL
- # Copy back and clean-up
- for name in routine.outputs:
- result += " " + routine.read_buffer(name, "0") + NL
- result += " return;" + NL + "}" + NL
+ # There is a version available in CBLAS
+ if flavour.precision_name in ["S", "D", "C", "Z"]:
+ template = "<" + flavour.template + ">" if routine.no_scalars() else ""
+ indent = " " * (12 + routine.length() + len(template))
+ result += routine.routine_header_netlib(flavour, 13, "") + " {" + NL
+
+ # Initialize OpenCL
+ result += " auto device = get_device();" + NL
+ result += " auto context = Context(device);" + NL
+ result += " auto queue = Queue(context, device);" + NL
+
+ # Set alpha and beta
+ result += "".join(" " + s + NL for s in routine.scalar_create_cpp(flavour))
+
+ # Copy data structures to the device
+ for i, name in enumerate(routine.inputs + routine.outputs):
+ result += " " + routine.set_size(name, routine.buffer_sizes[i]) + NL
+ result += " " + routine.create_buffer(name, flavour.buffer_type) + NL
+ for name in routine.inputs + routine.outputs:
+ prefix = "" if name in routine.outputs else "const "
+ result += " " + routine.write_buffer(name, prefix + flavour.buffer_type) + NL
+
+ # The function call
+ result += " auto queue_cl = queue();" + NL
+ result += " auto s = " + routine.name.capitalize() + template + "("
+ result += ("," + NL + indent).join([a for a in routine.arguments_netlib(flavour, indent)])
+ result += "," + NL + indent + "&queue_cl);" + NL
+
+ # Error handling
+ result += " if (s != StatusCode::kSuccess) {" + NL
+ result += " throw std::runtime_error(\"CLBlast returned with error code \" + ToString(s));" + NL
+ result += " }" + NL
+
+ # Copy back and clean-up
+ for name in routine.outputs:
+ result += " " + routine.read_buffer(name, flavour.buffer_type) + NL
+ result += "}" + NL
return result
diff --git a/scripts/generator/generator/datatype.py b/scripts/generator/generator/datatype.py
index 01f32dd8..98874174 100644
--- a/scripts/generator/generator/datatype.py
+++ b/scripts/generator/generator/datatype.py
@@ -54,6 +54,22 @@ class DataType:
return self.beta_cl + "{{beta.real(), beta.imag()}}"
return "beta"
+ def use_alpha_clblast(self):
+ """Transforms a Netlib CBLAS parameter to CLBlast style"""
+ if self.alpha_cpp == D_FLOAT2:
+ return self.alpha_cpp + "{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]}"
+ elif self.alpha_cpp == D_DOUBLE2:
+ return self.alpha_cpp + "{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]}"
+ return "alpha"
+
+ def use_beta_clblast(self):
+ """As above, but for beta instead of alpha"""
+ if self.beta_cpp == D_FLOAT2:
+ return self.beta_cpp + "{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]}"
+ elif self.beta_cpp == D_DOUBLE2:
+ return self.beta_cpp + "{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]}"
+ return "beta"
+
def test_template(self):
"""Returns the template as used in the correctness/performance tests"""
if self.buffer_type != self.beta_cpp:
diff --git a/scripts/generator/generator/routine.py b/scripts/generator/generator/routine.py
index 795fc532..b988c91a 100644
--- a/scripts/generator/generator/routine.py
+++ b/scripts/generator/generator/routine.py
@@ -13,7 +13,8 @@ import generator.convert as convert
class Routine:
"""Class holding routine-specific information (e.g. name, which arguments, which precisions)"""
def __init__(self, implemented, has_tests, level, name, template, flavours, sizes, options,
- inputs, outputs, scalars, scratch, description, details, requirements):
+ inputs, outputs, buffer_sizes, scalars, scratch,
+ description, details, requirements):
self.implemented = implemented
self.has_tests = has_tests
self.level = level
@@ -24,6 +25,7 @@ class Routine:
self.options = options
self.inputs = inputs
self.outputs = outputs
+ self.buffer_sizes = buffer_sizes
self.scalars = scalars
self.scratch = scratch # Scratch buffer (e.g. for xDOT)
self.description = description
@@ -66,19 +68,26 @@ class Routine:
return ["a", "b", "c", "ap"]
@staticmethod
- def create_buffer(name, template, size):
+ def set_size(name, size):
+ """Sets the size of a buffer"""
+ return "const auto " + name + "_size = " + size + ";"
+
+ @staticmethod
+ def create_buffer(name, template):
"""Creates a new CLCudaAPI buffer"""
- return "auto " + name + "_buffer = Buffer<" + template + ">(context, " + size + ");"
+ return "auto " + name + "_buffer = Buffer<" + template + ">(context, " + name + "_size);"
@staticmethod
- def write_buffer(name, size):
+ def write_buffer(name, template):
"""Writes to a CLCudaAPI buffer"""
- return name + "_buffer.Write(queue, " + size + ", " + name + ");"
+ data_structure = "reinterpret_cast<" + template + "*>(" + name + ")"
+ return name + "_buffer.Write(queue, " + name + "_size, " + data_structure + ");"
@staticmethod
- def read_buffer(name, size):
+ def read_buffer(name, template):
"""Reads from a CLCudaAPI buffer"""
- return name + "_buffer.Read(queue, " + size + ", " + name + ");"
+ data_structure = "reinterpret_cast<" + template + "*>(" + name + ")"
+ return name + "_buffer.Read(queue, " + name + "_size, " + data_structure + ");"
def non_index_inputs(self):
"""Lists of input/output buffers not index (integer)"""
@@ -148,6 +157,15 @@ class Routine:
return [", ".join(a + b + c)]
return []
+ def buffer_zero_offset(self, name):
+ """As above, but with an offset value of zero"""
+ if name in self.inputs or name in self.outputs:
+ a = [name + "_buffer()"]
+ b = ["0"]
+ c = [name + "_" + self.postfix(name)] if (name not in self.buffers_without_ld_inc()) else []
+ return [", ".join(a + b + c)]
+ return []
+
def buffer_def(self, name):
"""As above but with data-types"""
prefix = "const " if name in self.inputs else ""
@@ -263,6 +281,12 @@ class Routine:
return [name]
return []
+ def scalar_cpp(self, name):
+ """As above, but with _cpp as a suffix"""
+ if name in self.scalars:
+ return [name + "_cpp"]
+ return []
+
def scalar_half_to_float(self, name):
"""As above, but converts from float to half"""
if name in self.scalars:
@@ -339,6 +363,16 @@ class Routine:
return ["`const " + self.template.beta_cpp + " " + name + "`: Input scalar constant."]
return []
+ def scalar_create_cpp(self, flavour):
+ """Creates a C++ version of a scalar based on a void*"""
+ result = []
+ for name in self.scalars:
+ if name == "alpha":
+ result.append("const auto alpha_cpp = " + flavour.use_alpha_clblast() + ";")
+ elif name == "beta":
+ result.append("const auto beta_cpp = " + flavour.use_beta_clblast() + ";")
+ return result
+
def sizes_list(self):
"""Retrieves a list of comma-separated sizes (m, n, k)"""
if self.sizes:
@@ -469,6 +503,17 @@ class Routine:
list(chain(*[self.buffer(b) for b in self.scalar_buffers_second()])) +
list(chain(*[self.scalar_use(s, flavour) for s in self.other_scalars()])))
+ def arguments_netlib(self, flavour, indent):
+ """As above, but for the Netlib CBLAS API"""
+ return (self.options_cast(indent) + self.sizes_list() +
+ list(chain(*[self.buffer_zero_offset(b) for b in self.scalar_buffers_first()])) +
+ self.scalar_cpp("alpha") +
+ list(chain(*[self.buffer_zero_offset(b) for b in self.buffers_first()])) +
+ self.scalar_cpp("beta") +
+ list(chain(*[self.buffer_zero_offset(b) for b in self.buffers_second()])) +
+ list(chain(*[self.buffer_zero_offset(b) for b in self.scalar_buffers_second()])) +
+ list(chain(*[self.scalar(s) for s in self.other_scalars()])))
+
def arguments_wrapper_clblas(self, flavour):
"""As above, but for the clBLAS wrapper"""
return (self.options_list() + self.sizes_list() +
diff --git a/src/clblast_blas.cpp b/src/clblast_blas.cpp
new file mode 100644
index 00000000..286b1ba8
--- /dev/null
+++ b/src/clblast_blas.cpp
@@ -0,0 +1,4651 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the Netlib CBLAS interface to the CLBlast BLAS routines, performing buffer
+// copies automatically and running on the default OpenCL platform and device. For full control over
+// performance, it is advised to use the regular clblast.h or clblast_c.h headers instead.
+//
+// =================================================================================================
+
+#include <cstdlib>
+
+#include "clblast_blas.h"
+#include "clblast.h"
+#include "utilities/utilities.hpp"
+
+namespace clblast {
+
+// =================================================================================================
+
+// Helper function to get a default OpenCL platform and device
+Device get_device() {
+ auto platform_id = ConvertArgument(std::getenv("CLBLAST_PLATFORM"), size_t{0});
+ auto device_id = ConvertArgument(std::getenv("CLBLAST_DEVICE"), size_t{0});
+ auto platform = Platform(platform_id);
+ return Device(platform, device_id);
+}
+
+// =================================================================================================
+// BLAS level-1 (vector-vector) routines
+// =================================================================================================
+
+// ROTG
+void cblas_srotg(float* sa,
+ float* sb,
+ float* sc,
+ float* ss) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto sa_size = 1;
+ auto sa_buffer = Buffer<float>(context, sa_size);
+ const auto sb_size = 1;
+ auto sb_buffer = Buffer<float>(context, sb_size);
+ const auto sc_size = 1;
+ auto sc_buffer = Buffer<float>(context, sc_size);
+ const auto ss_size = 1;
+ auto ss_buffer = Buffer<float>(context, ss_size);
+ sa_buffer.Write(queue, sa_size, reinterpret_cast<float*>(sa));
+ sb_buffer.Write(queue, sb_size, reinterpret_cast<float*>(sb));
+ sc_buffer.Write(queue, sc_size, reinterpret_cast<float*>(sc));
+ ss_buffer.Write(queue, ss_size, reinterpret_cast<float*>(ss));
+ auto queue_cl = queue();
+ auto s = Rotg<float>(sa_buffer(), 0,
+ sb_buffer(), 0,
+ sc_buffer(), 0,
+ ss_buffer(), 0,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ sa_buffer.Read(queue, sa_size, reinterpret_cast<float*>(sa));
+ sb_buffer.Read(queue, sb_size, reinterpret_cast<float*>(sb));
+ sc_buffer.Read(queue, sc_size, reinterpret_cast<float*>(sc));
+ ss_buffer.Read(queue, ss_size, reinterpret_cast<float*>(ss));
+}
+void cblas_drotg(double* sa,
+ double* sb,
+ double* sc,
+ double* ss) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto sa_size = 1;
+ auto sa_buffer = Buffer<double>(context, sa_size);
+ const auto sb_size = 1;
+ auto sb_buffer = Buffer<double>(context, sb_size);
+ const auto sc_size = 1;
+ auto sc_buffer = Buffer<double>(context, sc_size);
+ const auto ss_size = 1;
+ auto ss_buffer = Buffer<double>(context, ss_size);
+ sa_buffer.Write(queue, sa_size, reinterpret_cast<double*>(sa));
+ sb_buffer.Write(queue, sb_size, reinterpret_cast<double*>(sb));
+ sc_buffer.Write(queue, sc_size, reinterpret_cast<double*>(sc));
+ ss_buffer.Write(queue, ss_size, reinterpret_cast<double*>(ss));
+ auto queue_cl = queue();
+ auto s = Rotg<double>(sa_buffer(), 0,
+ sb_buffer(), 0,
+ sc_buffer(), 0,
+ ss_buffer(), 0,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ sa_buffer.Read(queue, sa_size, reinterpret_cast<double*>(sa));
+ sb_buffer.Read(queue, sb_size, reinterpret_cast<double*>(sb));
+ sc_buffer.Read(queue, sc_size, reinterpret_cast<double*>(sc));
+ ss_buffer.Read(queue, ss_size, reinterpret_cast<double*>(ss));
+}
+
+// ROTMG
+void cblas_srotmg(float* sd1,
+ float* sd2,
+ float* sx1,
+ const float* sy1,
+ float* sparam) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto sy1_size = 1;
+ auto sy1_buffer = Buffer<float>(context, sy1_size);
+ const auto sd1_size = 1;
+ auto sd1_buffer = Buffer<float>(context, sd1_size);
+ const auto sd2_size = 1;
+ auto sd2_buffer = Buffer<float>(context, sd2_size);
+ const auto sx1_size = 1;
+ auto sx1_buffer = Buffer<float>(context, sx1_size);
+ const auto sparam_size = 1;
+ auto sparam_buffer = Buffer<float>(context, sparam_size);
+ sy1_buffer.Write(queue, sy1_size, reinterpret_cast<const float*>(sy1));
+ sd1_buffer.Write(queue, sd1_size, reinterpret_cast<float*>(sd1));
+ sd2_buffer.Write(queue, sd2_size, reinterpret_cast<float*>(sd2));
+ sx1_buffer.Write(queue, sx1_size, reinterpret_cast<float*>(sx1));
+ sparam_buffer.Write(queue, sparam_size, reinterpret_cast<float*>(sparam));
+ auto queue_cl = queue();
+ auto s = Rotmg<float>(sd1_buffer(), 0,
+ sd2_buffer(), 0,
+ sx1_buffer(), 0,
+ sy1_buffer(), 0,
+ sparam_buffer(), 0,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ sd1_buffer.Read(queue, sd1_size, reinterpret_cast<float*>(sd1));
+ sd2_buffer.Read(queue, sd2_size, reinterpret_cast<float*>(sd2));
+ sx1_buffer.Read(queue, sx1_size, reinterpret_cast<float*>(sx1));
+ sparam_buffer.Read(queue, sparam_size, reinterpret_cast<float*>(sparam));
+}
+void cblas_drotmg(double* sd1,
+ double* sd2,
+ double* sx1,
+ const double* sy1,
+ double* sparam) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto sy1_size = 1;
+ auto sy1_buffer = Buffer<double>(context, sy1_size);
+ const auto sd1_size = 1;
+ auto sd1_buffer = Buffer<double>(context, sd1_size);
+ const auto sd2_size = 1;
+ auto sd2_buffer = Buffer<double>(context, sd2_size);
+ const auto sx1_size = 1;
+ auto sx1_buffer = Buffer<double>(context, sx1_size);
+ const auto sparam_size = 1;
+ auto sparam_buffer = Buffer<double>(context, sparam_size);
+ sy1_buffer.Write(queue, sy1_size, reinterpret_cast<const double*>(sy1));
+ sd1_buffer.Write(queue, sd1_size, reinterpret_cast<double*>(sd1));
+ sd2_buffer.Write(queue, sd2_size, reinterpret_cast<double*>(sd2));
+ sx1_buffer.Write(queue, sx1_size, reinterpret_cast<double*>(sx1));
+ sparam_buffer.Write(queue, sparam_size, reinterpret_cast<double*>(sparam));
+ auto queue_cl = queue();
+ auto s = Rotmg<double>(sd1_buffer(), 0,
+ sd2_buffer(), 0,
+ sx1_buffer(), 0,
+ sy1_buffer(), 0,
+ sparam_buffer(), 0,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ sd1_buffer.Read(queue, sd1_size, reinterpret_cast<double*>(sd1));
+ sd2_buffer.Read(queue, sd2_size, reinterpret_cast<double*>(sd2));
+ sx1_buffer.Read(queue, sx1_size, reinterpret_cast<double*>(sx1));
+ sparam_buffer.Read(queue, sparam_size, reinterpret_cast<double*>(sparam));
+}
+
+// ROT
+void cblas_srot(const int n,
+ float* x, const int x_inc,
+ float* y, const int y_inc,
+ const float cos,
+ const float sin) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto x_size = n;
+ auto x_buffer = Buffer<float>(context, x_size);
+ const auto y_size = n;
+ auto y_buffer = Buffer<float>(context, y_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<float*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<float*>(y));
+ auto queue_cl = queue();
+ auto s = Rot(n,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ cos,
+ sin,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<float*>(x));
+ y_buffer.Read(queue, y_size, reinterpret_cast<float*>(y));
+}
+void cblas_drot(const int n,
+ double* x, const int x_inc,
+ double* y, const int y_inc,
+ const double cos,
+ const double sin) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto x_size = n;
+ auto x_buffer = Buffer<double>(context, x_size);
+ const auto y_size = n;
+ auto y_buffer = Buffer<double>(context, y_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<double*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<double*>(y));
+ auto queue_cl = queue();
+ auto s = Rot(n,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ cos,
+ sin,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<double*>(x));
+ y_buffer.Read(queue, y_size, reinterpret_cast<double*>(y));
+}
+
+// ROTM
+void cblas_srotm(const int n,
+ float* x, const int x_inc,
+ float* y, const int y_inc,
+ float* sparam) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto x_size = n;
+ auto x_buffer = Buffer<float>(context, x_size);
+ const auto y_size = n;
+ auto y_buffer = Buffer<float>(context, y_size);
+ const auto sparam_size = 1;
+ auto sparam_buffer = Buffer<float>(context, sparam_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<float*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<float*>(y));
+ sparam_buffer.Write(queue, sparam_size, reinterpret_cast<float*>(sparam));
+ auto queue_cl = queue();
+ auto s = Rotm<float>(n,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ sparam_buffer(), 0,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<float*>(x));
+ y_buffer.Read(queue, y_size, reinterpret_cast<float*>(y));
+ sparam_buffer.Read(queue, sparam_size, reinterpret_cast<float*>(sparam));
+}
+void cblas_drotm(const int n,
+ double* x, const int x_inc,
+ double* y, const int y_inc,
+ double* sparam) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto x_size = n;
+ auto x_buffer = Buffer<double>(context, x_size);
+ const auto y_size = n;
+ auto y_buffer = Buffer<double>(context, y_size);
+ const auto sparam_size = 1;
+ auto sparam_buffer = Buffer<double>(context, sparam_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<double*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<double*>(y));
+ sparam_buffer.Write(queue, sparam_size, reinterpret_cast<double*>(sparam));
+ auto queue_cl = queue();
+ auto s = Rotm<double>(n,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ sparam_buffer(), 0,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<double*>(x));
+ y_buffer.Read(queue, y_size, reinterpret_cast<double*>(y));
+ sparam_buffer.Read(queue, sparam_size, reinterpret_cast<double*>(sparam));
+}
+
+// SWAP
+void cblas_sswap(const int n,
+ float* x, const int x_inc,
+ float* y, const int y_inc) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto x_size = n;
+ auto x_buffer = Buffer<float>(context, x_size);
+ const auto y_size = n;
+ auto y_buffer = Buffer<float>(context, y_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<float*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<float*>(y));
+ auto queue_cl = queue();
+ auto s = Swap<float>(n,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<float*>(x));
+ y_buffer.Read(queue, y_size, reinterpret_cast<float*>(y));
+}
+void cblas_dswap(const int n,
+ double* x, const int x_inc,
+ double* y, const int y_inc) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto x_size = n;
+ auto x_buffer = Buffer<double>(context, x_size);
+ const auto y_size = n;
+ auto y_buffer = Buffer<double>(context, y_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<double*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<double*>(y));
+ auto queue_cl = queue();
+ auto s = Swap<double>(n,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<double*>(x));
+ y_buffer.Read(queue, y_size, reinterpret_cast<double*>(y));
+}
+void cblas_cswap(const int n,
+ void* x, const int x_inc,
+ void* y, const int y_inc) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto x_size = n;
+ auto x_buffer = Buffer<float2>(context, x_size);
+ const auto y_size = n;
+ auto y_buffer = Buffer<float2>(context, y_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<float2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<float2*>(y));
+ auto queue_cl = queue();
+ auto s = Swap<float2>(n,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<float2*>(x));
+ y_buffer.Read(queue, y_size, reinterpret_cast<float2*>(y));
+}
+void cblas_zswap(const int n,
+ void* x, const int x_inc,
+ void* y, const int y_inc) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto x_size = n;
+ auto x_buffer = Buffer<double2>(context, x_size);
+ const auto y_size = n;
+ auto y_buffer = Buffer<double2>(context, y_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<double2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<double2*>(y));
+ auto queue_cl = queue();
+ auto s = Swap<double2>(n,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<double2*>(x));
+ y_buffer.Read(queue, y_size, reinterpret_cast<double2*>(y));
+}
+
+// SCAL
+void cblas_sscal(const int n,
+ const float alpha,
+ float* x, const int x_inc) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto x_size = n;
+ auto x_buffer = Buffer<float>(context, x_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<float*>(x));
+ auto queue_cl = queue();
+ auto s = Scal(n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<float*>(x));
+}
+void cblas_dscal(const int n,
+ const double alpha,
+ double* x, const int x_inc) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto x_size = n;
+ auto x_buffer = Buffer<double>(context, x_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<double*>(x));
+ auto queue_cl = queue();
+ auto s = Scal(n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<double*>(x));
+}
+void cblas_cscal(const int n,
+ const void* alpha,
+ void* x, const int x_inc) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+ const auto x_size = n;
+ auto x_buffer = Buffer<float2>(context, x_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<float2*>(x));
+ auto queue_cl = queue();
+ auto s = Scal(n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<float2*>(x));
+}
+void cblas_zscal(const int n,
+ const void* alpha,
+ void* x, const int x_inc) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+ const auto x_size = n;
+ auto x_buffer = Buffer<double2>(context, x_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<double2*>(x));
+ auto queue_cl = queue();
+ auto s = Scal(n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<double2*>(x));
+}
+
+// COPY
+void cblas_scopy(const int n,
+ const float* x, const int x_inc,
+ float* y, const int y_inc) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto x_size = n;
+ auto x_buffer = Buffer<float>(context, x_size);
+ const auto y_size = n;
+ auto y_buffer = Buffer<float>(context, y_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<float*>(y));
+ auto queue_cl = queue();
+ auto s = Copy<float>(n,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ y_buffer.Read(queue, y_size, reinterpret_cast<float*>(y));
+}
+void cblas_dcopy(const int n,
+ const double* x, const int x_inc,
+ double* y, const int y_inc) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto x_size = n;
+ auto x_buffer = Buffer<double>(context, x_size);
+ const auto y_size = n;
+ auto y_buffer = Buffer<double>(context, y_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<double*>(y));
+ auto queue_cl = queue();
+ auto s = Copy<double>(n,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ y_buffer.Read(queue, y_size, reinterpret_cast<double*>(y));
+}
+void cblas_ccopy(const int n,
+ const void* x, const int x_inc,
+ void* y, const int y_inc) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto x_size = n;
+ auto x_buffer = Buffer<float2>(context, x_size);
+ const auto y_size = n;
+ auto y_buffer = Buffer<float2>(context, y_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<float2*>(y));
+ auto queue_cl = queue();
+ auto s = Copy<float2>(n,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ y_buffer.Read(queue, y_size, reinterpret_cast<float2*>(y));
+}
+void cblas_zcopy(const int n,
+ const void* x, const int x_inc,
+ void* y, const int y_inc) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto x_size = n;
+ auto x_buffer = Buffer<double2>(context, x_size);
+ const auto y_size = n;
+ auto y_buffer = Buffer<double2>(context, y_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<double2*>(y));
+ auto queue_cl = queue();
+ auto s = Copy<double2>(n,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ y_buffer.Read(queue, y_size, reinterpret_cast<double2*>(y));
+}
+
+// AXPY
+void cblas_saxpy(const int n,
+ const float alpha,
+ const float* x, const int x_inc,
+ float* y, const int y_inc) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto x_size = n;
+ auto x_buffer = Buffer<float>(context, x_size);
+ const auto y_size = n;
+ auto y_buffer = Buffer<float>(context, y_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<float*>(y));
+ auto queue_cl = queue();
+ auto s = Axpy(n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ y_buffer.Read(queue, y_size, reinterpret_cast<float*>(y));
+}
+void cblas_daxpy(const int n,
+ const double alpha,
+ const double* x, const int x_inc,
+ double* y, const int y_inc) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto x_size = n;
+ auto x_buffer = Buffer<double>(context, x_size);
+ const auto y_size = n;
+ auto y_buffer = Buffer<double>(context, y_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<double*>(y));
+ auto queue_cl = queue();
+ auto s = Axpy(n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ y_buffer.Read(queue, y_size, reinterpret_cast<double*>(y));
+}
+void cblas_caxpy(const int n,
+ const void* alpha,
+ const void* x, const int x_inc,
+ void* y, const int y_inc) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+ const auto x_size = n;
+ auto x_buffer = Buffer<float2>(context, x_size);
+ const auto y_size = n;
+ auto y_buffer = Buffer<float2>(context, y_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<float2*>(y));
+ auto queue_cl = queue();
+ auto s = Axpy(n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ y_buffer.Read(queue, y_size, reinterpret_cast<float2*>(y));
+}
+void cblas_zaxpy(const int n,
+ const void* alpha,
+ const void* x, const int x_inc,
+ void* y, const int y_inc) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+ const auto x_size = n;
+ auto x_buffer = Buffer<double2>(context, x_size);
+ const auto y_size = n;
+ auto y_buffer = Buffer<double2>(context, y_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<double2*>(y));
+ auto queue_cl = queue();
+ auto s = Axpy(n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ y_buffer.Read(queue, y_size, reinterpret_cast<double2*>(y));
+}
+
+// DOT
+void cblas_sdot(const int n,
+ float* dot,
+ const float* x, const int x_inc,
+ const float* y, const int y_inc) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto x_size = n;
+ auto x_buffer = Buffer<float>(context, x_size);
+ const auto y_size = n;
+ auto y_buffer = Buffer<float>(context, y_size);
+ const auto dot_size = 1;
+ auto dot_buffer = Buffer<float>(context, dot_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const float*>(y));
+ dot_buffer.Write(queue, dot_size, reinterpret_cast<float*>(dot));
+ auto queue_cl = queue();
+ auto s = Dot<float>(n,
+ dot_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ dot_buffer.Read(queue, dot_size, reinterpret_cast<float*>(dot));
+}
+void cblas_ddot(const int n,
+ double* dot,
+ const double* x, const int x_inc,
+ const double* y, const int y_inc) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto x_size = n;
+ auto x_buffer = Buffer<double>(context, x_size);
+ const auto y_size = n;
+ auto y_buffer = Buffer<double>(context, y_size);
+ const auto dot_size = 1;
+ auto dot_buffer = Buffer<double>(context, dot_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const double*>(y));
+ dot_buffer.Write(queue, dot_size, reinterpret_cast<double*>(dot));
+ auto queue_cl = queue();
+ auto s = Dot<double>(n,
+ dot_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ dot_buffer.Read(queue, dot_size, reinterpret_cast<double*>(dot));
+}
+
+// DOTU
+void cblas_cdotu(const int n,
+ void* dot,
+ const void* x, const int x_inc,
+ const void* y, const int y_inc) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto x_size = n;
+ auto x_buffer = Buffer<float2>(context, x_size);
+ const auto y_size = n;
+ auto y_buffer = Buffer<float2>(context, y_size);
+ const auto dot_size = 1;
+ auto dot_buffer = Buffer<float2>(context, dot_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const float2*>(y));
+ dot_buffer.Write(queue, dot_size, reinterpret_cast<float2*>(dot));
+ auto queue_cl = queue();
+ auto s = Dotu<float2>(n,
+ dot_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ dot_buffer.Read(queue, dot_size, reinterpret_cast<float2*>(dot));
+}
+void cblas_zdotu(const int n,
+ void* dot,
+ const void* x, const int x_inc,
+ const void* y, const int y_inc) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto x_size = n;
+ auto x_buffer = Buffer<double2>(context, x_size);
+ const auto y_size = n;
+ auto y_buffer = Buffer<double2>(context, y_size);
+ const auto dot_size = 1;
+ auto dot_buffer = Buffer<double2>(context, dot_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const double2*>(y));
+ dot_buffer.Write(queue, dot_size, reinterpret_cast<double2*>(dot));
+ auto queue_cl = queue();
+ auto s = Dotu<double2>(n,
+ dot_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ dot_buffer.Read(queue, dot_size, reinterpret_cast<double2*>(dot));
+}
+
+// DOTC
+void cblas_cdotc(const int n,
+ void* dot,
+ const void* x, const int x_inc,
+ const void* y, const int y_inc) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto x_size = n;
+ auto x_buffer = Buffer<float2>(context, x_size);
+ const auto y_size = n;
+ auto y_buffer = Buffer<float2>(context, y_size);
+ const auto dot_size = 1;
+ auto dot_buffer = Buffer<float2>(context, dot_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const float2*>(y));
+ dot_buffer.Write(queue, dot_size, reinterpret_cast<float2*>(dot));
+ auto queue_cl = queue();
+ auto s = Dotc<float2>(n,
+ dot_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ dot_buffer.Read(queue, dot_size, reinterpret_cast<float2*>(dot));
+}
+void cblas_zdotc(const int n,
+ void* dot,
+ const void* x, const int x_inc,
+ const void* y, const int y_inc) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto x_size = n;
+ auto x_buffer = Buffer<double2>(context, x_size);
+ const auto y_size = n;
+ auto y_buffer = Buffer<double2>(context, y_size);
+ const auto dot_size = 1;
+ auto dot_buffer = Buffer<double2>(context, dot_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const double2*>(y));
+ dot_buffer.Write(queue, dot_size, reinterpret_cast<double2*>(dot));
+ auto queue_cl = queue();
+ auto s = Dotc<double2>(n,
+ dot_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ dot_buffer.Read(queue, dot_size, reinterpret_cast<double2*>(dot));
+}
+
+// NRM2
+void cblas_snrm2(const int n,
+ float* nrm2,
+ const float* x, const int x_inc) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto x_size = n;
+ auto x_buffer = Buffer<float>(context, x_size);
+ const auto nrm2_size = 1;
+ auto nrm2_buffer = Buffer<float>(context, nrm2_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+ nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast<float*>(nrm2));
+ auto queue_cl = queue();
+ auto s = Nrm2<float>(n,
+ nrm2_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast<float*>(nrm2));
+}
+void cblas_dnrm2(const int n,
+ double* nrm2,
+ const double* x, const int x_inc) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto x_size = n;
+ auto x_buffer = Buffer<double>(context, x_size);
+ const auto nrm2_size = 1;
+ auto nrm2_buffer = Buffer<double>(context, nrm2_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+ nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast<double*>(nrm2));
+ auto queue_cl = queue();
+ auto s = Nrm2<double>(n,
+ nrm2_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast<double*>(nrm2));
+}
+void cblas_scnrm2(const int n,
+ void* nrm2,
+ const void* x, const int x_inc) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto x_size = n;
+ auto x_buffer = Buffer<float2>(context, x_size);
+ const auto nrm2_size = 1;
+ auto nrm2_buffer = Buffer<float2>(context, nrm2_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+ nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast<float2*>(nrm2));
+ auto queue_cl = queue();
+ auto s = Nrm2<float2>(n,
+ nrm2_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast<float2*>(nrm2));
+}
+void cblas_dznrm2(const int n,
+ void* nrm2,
+ const void* x, const int x_inc) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto x_size = n;
+ auto x_buffer = Buffer<double2>(context, x_size);
+ const auto nrm2_size = 1;
+ auto nrm2_buffer = Buffer<double2>(context, nrm2_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+ nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast<double2*>(nrm2));
+ auto queue_cl = queue();
+ auto s = Nrm2<double2>(n,
+ nrm2_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast<double2*>(nrm2));
+}
+
+// ASUM
+void cblas_sasum(const int n,
+ float* asum,
+ const float* x, const int x_inc) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto x_size = n;
+ auto x_buffer = Buffer<float>(context, x_size);
+ const auto asum_size = 1;
+ auto asum_buffer = Buffer<float>(context, asum_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+ asum_buffer.Write(queue, asum_size, reinterpret_cast<float*>(asum));
+ auto queue_cl = queue();
+ auto s = Asum<float>(n,
+ asum_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ asum_buffer.Read(queue, asum_size, reinterpret_cast<float*>(asum));
+}
+void cblas_dasum(const int n,
+ double* asum,
+ const double* x, const int x_inc) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto x_size = n;
+ auto x_buffer = Buffer<double>(context, x_size);
+ const auto asum_size = 1;
+ auto asum_buffer = Buffer<double>(context, asum_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+ asum_buffer.Write(queue, asum_size, reinterpret_cast<double*>(asum));
+ auto queue_cl = queue();
+ auto s = Asum<double>(n,
+ asum_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ asum_buffer.Read(queue, asum_size, reinterpret_cast<double*>(asum));
+}
+void cblas_scasum(const int n,
+ void* asum,
+ const void* x, const int x_inc) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto x_size = n;
+ auto x_buffer = Buffer<float2>(context, x_size);
+ const auto asum_size = 1;
+ auto asum_buffer = Buffer<float2>(context, asum_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+ asum_buffer.Write(queue, asum_size, reinterpret_cast<float2*>(asum));
+ auto queue_cl = queue();
+ auto s = Asum<float2>(n,
+ asum_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ asum_buffer.Read(queue, asum_size, reinterpret_cast<float2*>(asum));
+}
+void cblas_dzasum(const int n,
+ void* asum,
+ const void* x, const int x_inc) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto x_size = n;
+ auto x_buffer = Buffer<double2>(context, x_size);
+ const auto asum_size = 1;
+ auto asum_buffer = Buffer<double2>(context, asum_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+ asum_buffer.Write(queue, asum_size, reinterpret_cast<double2*>(asum));
+ auto queue_cl = queue();
+ auto s = Asum<double2>(n,
+ asum_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ asum_buffer.Read(queue, asum_size, reinterpret_cast<double2*>(asum));
+}
+
+// SUM
+void cblas_ssum(const int n,
+ float* sum,
+ const float* x, const int x_inc) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto x_size = n;
+ auto x_buffer = Buffer<float>(context, x_size);
+ const auto sum_size = 1;
+ auto sum_buffer = Buffer<float>(context, sum_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+ sum_buffer.Write(queue, sum_size, reinterpret_cast<float*>(sum));
+ auto queue_cl = queue();
+ auto s = Sum<float>(n,
+ sum_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ sum_buffer.Read(queue, sum_size, reinterpret_cast<float*>(sum));
+}
+void cblas_dsum(const int n,
+ double* sum,
+ const double* x, const int x_inc) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto x_size = n;
+ auto x_buffer = Buffer<double>(context, x_size);
+ const auto sum_size = 1;
+ auto sum_buffer = Buffer<double>(context, sum_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+ sum_buffer.Write(queue, sum_size, reinterpret_cast<double*>(sum));
+ auto queue_cl = queue();
+ auto s = Sum<double>(n,
+ sum_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ sum_buffer.Read(queue, sum_size, reinterpret_cast<double*>(sum));
+}
+// SUM (single-precision complex): sums the n entries of strided vector x.
+// The result (a float2) is written through the void* sum pointer.
+void cblas_scsum(const int n,
+                 void* sum,
+                 const void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  // A vector with stride x_inc spans n * x_inc elements, not just n.
+  const auto x_size = n * x_inc;
+  auto x_buffer = Buffer<float2>(context, x_size);
+  const auto sum_size = 1;
+  auto sum_buffer = Buffer<float2>(context, sum_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+  sum_buffer.Write(queue, sum_size, reinterpret_cast<float2*>(sum));
+  auto queue_cl = queue();
+  auto s = Sum<float2>(n,
+                       sum_buffer(), 0,
+                       x_buffer(), 0, x_inc,
+                       &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  sum_buffer.Read(queue, sum_size, reinterpret_cast<float2*>(sum));
+}
+// SUM (double-precision complex): sums the n entries of strided vector x.
+// The result (a double2) is written through the void* sum pointer.
+void cblas_dzsum(const int n,
+                 void* sum,
+                 const void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  // A vector with stride x_inc spans n * x_inc elements, not just n.
+  const auto x_size = n * x_inc;
+  auto x_buffer = Buffer<double2>(context, x_size);
+  const auto sum_size = 1;
+  auto sum_buffer = Buffer<double2>(context, sum_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+  sum_buffer.Write(queue, sum_size, reinterpret_cast<double2*>(sum));
+  auto queue_cl = queue();
+  auto s = Sum<double2>(n,
+                        sum_buffer(), 0,
+                        x_buffer(), 0, x_inc,
+                        &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  sum_buffer.Read(queue, sum_size, reinterpret_cast<double2*>(sum));
+}
+
+// AMAX
+// AMAX: index of the element with the largest absolute value of vector x.
+// NOTE(review): the index is returned through a float-typed buffer — generator
+// convention in this API; confirm against the CLBlast Amax routine's contract.
+void cblas_isamax(const int n,
+                  float* imax,
+                  const float* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  // A vector with stride x_inc spans n * x_inc elements, not just n.
+  const auto x_size = n * x_inc;
+  auto x_buffer = Buffer<float>(context, x_size);
+  const auto imax_size = 1;
+  auto imax_buffer = Buffer<float>(context, imax_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+  imax_buffer.Write(queue, imax_size, reinterpret_cast<float*>(imax));
+  auto queue_cl = queue();
+  auto s = Amax<float>(n,
+                       imax_buffer(), 0,
+                       x_buffer(), 0, x_inc,
+                       &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  imax_buffer.Read(queue, imax_size, reinterpret_cast<float*>(imax));
+}
+// AMAX (double): index of the element with the largest absolute value of x.
+void cblas_idamax(const int n,
+                  double* imax,
+                  const double* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  // A vector with stride x_inc spans n * x_inc elements, not just n.
+  const auto x_size = n * x_inc;
+  auto x_buffer = Buffer<double>(context, x_size);
+  const auto imax_size = 1;
+  auto imax_buffer = Buffer<double>(context, imax_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+  imax_buffer.Write(queue, imax_size, reinterpret_cast<double*>(imax));
+  auto queue_cl = queue();
+  auto s = Amax<double>(n,
+                        imax_buffer(), 0,
+                        x_buffer(), 0, x_inc,
+                        &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  imax_buffer.Read(queue, imax_size, reinterpret_cast<double*>(imax));
+}
+// AMAX (single complex): index of the element with the largest absolute value.
+void cblas_icamax(const int n,
+                  void* imax,
+                  const void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  // A vector with stride x_inc spans n * x_inc elements, not just n.
+  const auto x_size = n * x_inc;
+  auto x_buffer = Buffer<float2>(context, x_size);
+  const auto imax_size = 1;
+  auto imax_buffer = Buffer<float2>(context, imax_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+  imax_buffer.Write(queue, imax_size, reinterpret_cast<float2*>(imax));
+  auto queue_cl = queue();
+  auto s = Amax<float2>(n,
+                        imax_buffer(), 0,
+                        x_buffer(), 0, x_inc,
+                        &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  imax_buffer.Read(queue, imax_size, reinterpret_cast<float2*>(imax));
+}
+// AMAX (double complex): index of the element with the largest absolute value.
+void cblas_izamax(const int n,
+                  void* imax,
+                  const void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  // A vector with stride x_inc spans n * x_inc elements, not just n.
+  const auto x_size = n * x_inc;
+  auto x_buffer = Buffer<double2>(context, x_size);
+  const auto imax_size = 1;
+  auto imax_buffer = Buffer<double2>(context, imax_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+  imax_buffer.Write(queue, imax_size, reinterpret_cast<double2*>(imax));
+  auto queue_cl = queue();
+  auto s = Amax<double2>(n,
+                         imax_buffer(), 0,
+                         x_buffer(), 0, x_inc,
+                         &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  imax_buffer.Read(queue, imax_size, reinterpret_cast<double2*>(imax));
+}
+
+// MAX
+// MAX (CLBlast extension): index of the maximum (signed) element of vector x.
+void cblas_ismax(const int n,
+                 float* imax,
+                 const float* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  // A vector with stride x_inc spans n * x_inc elements, not just n.
+  const auto x_size = n * x_inc;
+  auto x_buffer = Buffer<float>(context, x_size);
+  const auto imax_size = 1;
+  auto imax_buffer = Buffer<float>(context, imax_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+  imax_buffer.Write(queue, imax_size, reinterpret_cast<float*>(imax));
+  auto queue_cl = queue();
+  auto s = Max<float>(n,
+                      imax_buffer(), 0,
+                      x_buffer(), 0, x_inc,
+                      &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  imax_buffer.Read(queue, imax_size, reinterpret_cast<float*>(imax));
+}
+// MAX (double, CLBlast extension): index of the maximum element of vector x.
+void cblas_idmax(const int n,
+                 double* imax,
+                 const double* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  // A vector with stride x_inc spans n * x_inc elements, not just n.
+  const auto x_size = n * x_inc;
+  auto x_buffer = Buffer<double>(context, x_size);
+  const auto imax_size = 1;
+  auto imax_buffer = Buffer<double>(context, imax_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+  imax_buffer.Write(queue, imax_size, reinterpret_cast<double*>(imax));
+  auto queue_cl = queue();
+  auto s = Max<double>(n,
+                       imax_buffer(), 0,
+                       x_buffer(), 0, x_inc,
+                       &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  imax_buffer.Read(queue, imax_size, reinterpret_cast<double*>(imax));
+}
+// MAX (single complex, CLBlast extension): index of the maximum element of x.
+void cblas_icmax(const int n,
+                 void* imax,
+                 const void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  // A vector with stride x_inc spans n * x_inc elements, not just n.
+  const auto x_size = n * x_inc;
+  auto x_buffer = Buffer<float2>(context, x_size);
+  const auto imax_size = 1;
+  auto imax_buffer = Buffer<float2>(context, imax_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+  imax_buffer.Write(queue, imax_size, reinterpret_cast<float2*>(imax));
+  auto queue_cl = queue();
+  auto s = Max<float2>(n,
+                       imax_buffer(), 0,
+                       x_buffer(), 0, x_inc,
+                       &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  imax_buffer.Read(queue, imax_size, reinterpret_cast<float2*>(imax));
+}
+// MAX (double complex, CLBlast extension): index of the maximum element of x.
+void cblas_izmax(const int n,
+                 void* imax,
+                 const void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  // A vector with stride x_inc spans n * x_inc elements, not just n.
+  const auto x_size = n * x_inc;
+  auto x_buffer = Buffer<double2>(context, x_size);
+  const auto imax_size = 1;
+  auto imax_buffer = Buffer<double2>(context, imax_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+  imax_buffer.Write(queue, imax_size, reinterpret_cast<double2*>(imax));
+  auto queue_cl = queue();
+  auto s = Max<double2>(n,
+                        imax_buffer(), 0,
+                        x_buffer(), 0, x_inc,
+                        &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  imax_buffer.Read(queue, imax_size, reinterpret_cast<double2*>(imax));
+}
+
+// MIN
+// MIN (CLBlast extension): index of the minimum (signed) element of vector x.
+void cblas_ismin(const int n,
+                 float* imin,
+                 const float* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  // A vector with stride x_inc spans n * x_inc elements, not just n.
+  const auto x_size = n * x_inc;
+  auto x_buffer = Buffer<float>(context, x_size);
+  const auto imin_size = 1;
+  auto imin_buffer = Buffer<float>(context, imin_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+  imin_buffer.Write(queue, imin_size, reinterpret_cast<float*>(imin));
+  auto queue_cl = queue();
+  auto s = Min<float>(n,
+                      imin_buffer(), 0,
+                      x_buffer(), 0, x_inc,
+                      &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  imin_buffer.Read(queue, imin_size, reinterpret_cast<float*>(imin));
+}
+// MIN (double, CLBlast extension): index of the minimum element of vector x.
+void cblas_idmin(const int n,
+                 double* imin,
+                 const double* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  // A vector with stride x_inc spans n * x_inc elements, not just n.
+  const auto x_size = n * x_inc;
+  auto x_buffer = Buffer<double>(context, x_size);
+  const auto imin_size = 1;
+  auto imin_buffer = Buffer<double>(context, imin_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+  imin_buffer.Write(queue, imin_size, reinterpret_cast<double*>(imin));
+  auto queue_cl = queue();
+  auto s = Min<double>(n,
+                       imin_buffer(), 0,
+                       x_buffer(), 0, x_inc,
+                       &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  imin_buffer.Read(queue, imin_size, reinterpret_cast<double*>(imin));
+}
+// MIN (single complex, CLBlast extension): index of the minimum element of x.
+void cblas_icmin(const int n,
+                 void* imin,
+                 const void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  // A vector with stride x_inc spans n * x_inc elements, not just n.
+  const auto x_size = n * x_inc;
+  auto x_buffer = Buffer<float2>(context, x_size);
+  const auto imin_size = 1;
+  auto imin_buffer = Buffer<float2>(context, imin_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+  imin_buffer.Write(queue, imin_size, reinterpret_cast<float2*>(imin));
+  auto queue_cl = queue();
+  auto s = Min<float2>(n,
+                       imin_buffer(), 0,
+                       x_buffer(), 0, x_inc,
+                       &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  imin_buffer.Read(queue, imin_size, reinterpret_cast<float2*>(imin));
+}
+// MIN (double complex, CLBlast extension): index of the minimum element of x.
+void cblas_izmin(const int n,
+                 void* imin,
+                 const void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  // A vector with stride x_inc spans n * x_inc elements, not just n.
+  const auto x_size = n * x_inc;
+  auto x_buffer = Buffer<double2>(context, x_size);
+  const auto imin_size = 1;
+  auto imin_buffer = Buffer<double2>(context, imin_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+  imin_buffer.Write(queue, imin_size, reinterpret_cast<double2*>(imin));
+  auto queue_cl = queue();
+  auto s = Min<double2>(n,
+                        imin_buffer(), 0,
+                        x_buffer(), 0, x_inc,
+                        &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  imin_buffer.Read(queue, imin_size, reinterpret_cast<double2*>(imin));
+}
+
+// =================================================================================================
+// BLAS level-2 (matrix-vector) routines
+// =================================================================================================
+
+// GEMV
+// GEMV: y = alpha * op(A) * x + beta * y, with A an m-by-n matrix.
+// Copies host data to device buffers, runs CLBlast Gemv, reads y back.
+void cblas_sgemv(const Layout layout, const Transpose a_transpose,
+                 const int m, const int n,
+                 const float alpha,
+                 const float* a, const int a_ld,
+                 const float* x, const int x_inc,
+                 const float beta,
+                 float* y, const int y_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = alpha;
+  const auto beta_cpp = beta;
+  // A is m-by-n with leading dimension a_ld: its storage spans a_ld per row
+  // (row-major) or per column (column-major), never just n elements.
+  const auto a_size = (layout == kRowMajor) ? m * a_ld : n * a_ld;
+  // The lengths of x and y swap when A is transposed; include the strides.
+  const auto x_size = (a_transpose != kNo) ? m * x_inc : n * x_inc;
+  const auto y_size = (a_transpose != kNo) ? n * y_inc : m * y_inc;
+  auto a_buffer = Buffer<float>(context, a_size);
+  auto x_buffer = Buffer<float>(context, x_size);
+  auto y_buffer = Buffer<float>(context, y_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast<float*>(y));
+  auto queue_cl = queue();
+  auto s = Gemv(static_cast<clblast::Layout>(layout),
+                static_cast<clblast::Transpose>(a_transpose),
+                m, n,
+                alpha_cpp,
+                a_buffer(), 0, a_ld,
+                x_buffer(), 0, x_inc,
+                beta_cpp,
+                y_buffer(), 0, y_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast<float*>(y));
+}
+// GEMV (double): y = alpha * op(A) * x + beta * y, with A an m-by-n matrix.
+void cblas_dgemv(const Layout layout, const Transpose a_transpose,
+                 const int m, const int n,
+                 const double alpha,
+                 const double* a, const int a_ld,
+                 const double* x, const int x_inc,
+                 const double beta,
+                 double* y, const int y_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = alpha;
+  const auto beta_cpp = beta;
+  // Buffer sizes follow the matrix dimensions, leading dimension and strides.
+  const auto a_size = (layout == kRowMajor) ? m * a_ld : n * a_ld;
+  const auto x_size = (a_transpose != kNo) ? m * x_inc : n * x_inc;
+  const auto y_size = (a_transpose != kNo) ? n * y_inc : m * y_inc;
+  auto a_buffer = Buffer<double>(context, a_size);
+  auto x_buffer = Buffer<double>(context, x_size);
+  auto y_buffer = Buffer<double>(context, y_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast<double*>(y));
+  auto queue_cl = queue();
+  auto s = Gemv(static_cast<clblast::Layout>(layout),
+                static_cast<clblast::Transpose>(a_transpose),
+                m, n,
+                alpha_cpp,
+                a_buffer(), 0, a_ld,
+                x_buffer(), 0, x_inc,
+                beta_cpp,
+                y_buffer(), 0, y_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast<double*>(y));
+}
+// GEMV (single complex): y = alpha * op(A) * x + beta * y.
+// alpha/beta arrive as void* pointing at two floats (real, imaginary).
+void cblas_cgemv(const Layout layout, const Transpose a_transpose,
+                 const int m, const int n,
+                 const void* alpha,
+                 const void* a, const int a_ld,
+                 const void* x, const int x_inc,
+                 const void* beta,
+                 void* y, const int y_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+  const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
+  // Buffer sizes follow the matrix dimensions, leading dimension and strides.
+  const auto a_size = (layout == kRowMajor) ? m * a_ld : n * a_ld;
+  const auto x_size = (a_transpose != kNo) ? m * x_inc : n * x_inc;
+  const auto y_size = (a_transpose != kNo) ? n * y_inc : m * y_inc;
+  auto a_buffer = Buffer<float2>(context, a_size);
+  auto x_buffer = Buffer<float2>(context, x_size);
+  auto y_buffer = Buffer<float2>(context, y_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast<float2*>(y));
+  auto queue_cl = queue();
+  auto s = Gemv(static_cast<clblast::Layout>(layout),
+                static_cast<clblast::Transpose>(a_transpose),
+                m, n,
+                alpha_cpp,
+                a_buffer(), 0, a_ld,
+                x_buffer(), 0, x_inc,
+                beta_cpp,
+                y_buffer(), 0, y_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast<float2*>(y));
+}
+// GEMV (double complex): y = alpha * op(A) * x + beta * y.
+// alpha/beta arrive as void* pointing at two doubles (real, imaginary).
+void cblas_zgemv(const Layout layout, const Transpose a_transpose,
+                 const int m, const int n,
+                 const void* alpha,
+                 const void* a, const int a_ld,
+                 const void* x, const int x_inc,
+                 const void* beta,
+                 void* y, const int y_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+  const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
+  // Buffer sizes follow the matrix dimensions, leading dimension and strides.
+  const auto a_size = (layout == kRowMajor) ? m * a_ld : n * a_ld;
+  const auto x_size = (a_transpose != kNo) ? m * x_inc : n * x_inc;
+  const auto y_size = (a_transpose != kNo) ? n * y_inc : m * y_inc;
+  auto a_buffer = Buffer<double2>(context, a_size);
+  auto x_buffer = Buffer<double2>(context, x_size);
+  auto y_buffer = Buffer<double2>(context, y_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast<double2*>(y));
+  auto queue_cl = queue();
+  auto s = Gemv(static_cast<clblast::Layout>(layout),
+                static_cast<clblast::Transpose>(a_transpose),
+                m, n,
+                alpha_cpp,
+                a_buffer(), 0, a_ld,
+                x_buffer(), 0, x_inc,
+                beta_cpp,
+                y_buffer(), 0, y_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast<double2*>(y));
+}
+
+// GBMV
+// GBMV: banded matrix-vector multiply, A is m-by-n with kl sub- and ku
+// super-diagonals stored in band format with leading dimension a_ld.
+void cblas_sgbmv(const Layout layout, const Transpose a_transpose,
+                 const int m, const int n, const int kl, const int ku,
+                 const float alpha,
+                 const float* a, const int a_ld,
+                 const float* x, const int x_inc,
+                 const float beta,
+                 float* y, const int y_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = alpha;
+  const auto beta_cpp = beta;
+  // Band storage spans a_ld per row (row-major) or per column (column-major).
+  const auto a_size = (layout == kRowMajor) ? m * a_ld : n * a_ld;
+  const auto x_size = (a_transpose != kNo) ? m * x_inc : n * x_inc;
+  const auto y_size = (a_transpose != kNo) ? n * y_inc : m * y_inc;
+  auto a_buffer = Buffer<float>(context, a_size);
+  auto x_buffer = Buffer<float>(context, x_size);
+  auto y_buffer = Buffer<float>(context, y_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast<float*>(y));
+  auto queue_cl = queue();
+  auto s = Gbmv(static_cast<clblast::Layout>(layout),
+                static_cast<clblast::Transpose>(a_transpose),
+                m, n, kl, ku,
+                alpha_cpp,
+                a_buffer(), 0, a_ld,
+                x_buffer(), 0, x_inc,
+                beta_cpp,
+                y_buffer(), 0, y_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast<float*>(y));
+}
+// GBMV (double): banded matrix-vector multiply, band storage with a_ld.
+void cblas_dgbmv(const Layout layout, const Transpose a_transpose,
+                 const int m, const int n, const int kl, const int ku,
+                 const double alpha,
+                 const double* a, const int a_ld,
+                 const double* x, const int x_inc,
+                 const double beta,
+                 double* y, const int y_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = alpha;
+  const auto beta_cpp = beta;
+  // Band storage spans a_ld per row (row-major) or per column (column-major).
+  const auto a_size = (layout == kRowMajor) ? m * a_ld : n * a_ld;
+  const auto x_size = (a_transpose != kNo) ? m * x_inc : n * x_inc;
+  const auto y_size = (a_transpose != kNo) ? n * y_inc : m * y_inc;
+  auto a_buffer = Buffer<double>(context, a_size);
+  auto x_buffer = Buffer<double>(context, x_size);
+  auto y_buffer = Buffer<double>(context, y_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast<double*>(y));
+  auto queue_cl = queue();
+  auto s = Gbmv(static_cast<clblast::Layout>(layout),
+                static_cast<clblast::Transpose>(a_transpose),
+                m, n, kl, ku,
+                alpha_cpp,
+                a_buffer(), 0, a_ld,
+                x_buffer(), 0, x_inc,
+                beta_cpp,
+                y_buffer(), 0, y_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast<double*>(y));
+}
+// GBMV (single complex): banded matrix-vector multiply, band storage.
+void cblas_cgbmv(const Layout layout, const Transpose a_transpose,
+                 const int m, const int n, const int kl, const int ku,
+                 const void* alpha,
+                 const void* a, const int a_ld,
+                 const void* x, const int x_inc,
+                 const void* beta,
+                 void* y, const int y_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+  const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
+  // Band storage spans a_ld per row (row-major) or per column (column-major).
+  const auto a_size = (layout == kRowMajor) ? m * a_ld : n * a_ld;
+  const auto x_size = (a_transpose != kNo) ? m * x_inc : n * x_inc;
+  const auto y_size = (a_transpose != kNo) ? n * y_inc : m * y_inc;
+  auto a_buffer = Buffer<float2>(context, a_size);
+  auto x_buffer = Buffer<float2>(context, x_size);
+  auto y_buffer = Buffer<float2>(context, y_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast<float2*>(y));
+  auto queue_cl = queue();
+  auto s = Gbmv(static_cast<clblast::Layout>(layout),
+                static_cast<clblast::Transpose>(a_transpose),
+                m, n, kl, ku,
+                alpha_cpp,
+                a_buffer(), 0, a_ld,
+                x_buffer(), 0, x_inc,
+                beta_cpp,
+                y_buffer(), 0, y_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast<float2*>(y));
+}
+// GBMV (double complex): banded matrix-vector multiply, band storage.
+void cblas_zgbmv(const Layout layout, const Transpose a_transpose,
+                 const int m, const int n, const int kl, const int ku,
+                 const void* alpha,
+                 const void* a, const int a_ld,
+                 const void* x, const int x_inc,
+                 const void* beta,
+                 void* y, const int y_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+  const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
+  // Band storage spans a_ld per row (row-major) or per column (column-major).
+  const auto a_size = (layout == kRowMajor) ? m * a_ld : n * a_ld;
+  const auto x_size = (a_transpose != kNo) ? m * x_inc : n * x_inc;
+  const auto y_size = (a_transpose != kNo) ? n * y_inc : m * y_inc;
+  auto a_buffer = Buffer<double2>(context, a_size);
+  auto x_buffer = Buffer<double2>(context, x_size);
+  auto y_buffer = Buffer<double2>(context, y_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast<double2*>(y));
+  auto queue_cl = queue();
+  auto s = Gbmv(static_cast<clblast::Layout>(layout),
+                static_cast<clblast::Transpose>(a_transpose),
+                m, n, kl, ku,
+                alpha_cpp,
+                a_buffer(), 0, a_ld,
+                x_buffer(), 0, x_inc,
+                beta_cpp,
+                y_buffer(), 0, y_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast<double2*>(y));
+}
+
+// HEMV
+// HEMV: Hermitian matrix-vector multiply y = alpha*A*x + beta*y, A is n-by-n.
+void cblas_chemv(const Layout layout, const Triangle triangle,
+                 const int n,
+                 const void* alpha,
+                 const void* a, const int a_ld,
+                 const void* x, const int x_inc,
+                 const void* beta,
+                 void* y, const int y_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+  const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
+  // n-by-n matrix storage spans n * a_ld; vectors span n times their stride.
+  const auto a_size = n * a_ld;
+  const auto x_size = n * x_inc;
+  const auto y_size = n * y_inc;
+  auto a_buffer = Buffer<float2>(context, a_size);
+  auto x_buffer = Buffer<float2>(context, x_size);
+  auto y_buffer = Buffer<float2>(context, y_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast<float2*>(y));
+  auto queue_cl = queue();
+  auto s = Hemv(static_cast<clblast::Layout>(layout),
+                static_cast<clblast::Triangle>(triangle),
+                n,
+                alpha_cpp,
+                a_buffer(), 0, a_ld,
+                x_buffer(), 0, x_inc,
+                beta_cpp,
+                y_buffer(), 0, y_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast<float2*>(y));
+}
+// HEMV (double complex): Hermitian matrix-vector multiply, A is n-by-n.
+void cblas_zhemv(const Layout layout, const Triangle triangle,
+                 const int n,
+                 const void* alpha,
+                 const void* a, const int a_ld,
+                 const void* x, const int x_inc,
+                 const void* beta,
+                 void* y, const int y_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+  const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
+  // n-by-n matrix storage spans n * a_ld; vectors span n times their stride.
+  const auto a_size = n * a_ld;
+  const auto x_size = n * x_inc;
+  const auto y_size = n * y_inc;
+  auto a_buffer = Buffer<double2>(context, a_size);
+  auto x_buffer = Buffer<double2>(context, x_size);
+  auto y_buffer = Buffer<double2>(context, y_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast<double2*>(y));
+  auto queue_cl = queue();
+  auto s = Hemv(static_cast<clblast::Layout>(layout),
+                static_cast<clblast::Triangle>(triangle),
+                n,
+                alpha_cpp,
+                a_buffer(), 0, a_ld,
+                x_buffer(), 0, x_inc,
+                beta_cpp,
+                y_buffer(), 0, y_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast<double2*>(y));
+}
+
+// HBMV
+// HBMV: Hermitian banded matrix-vector multiply, A is n-by-n with k
+// super-diagonals stored in band format with leading dimension a_ld.
+void cblas_chbmv(const Layout layout, const Triangle triangle,
+                 const int n, const int k,
+                 const void* alpha,
+                 const void* a, const int a_ld,
+                 const void* x, const int x_inc,
+                 const void* beta,
+                 void* y, const int y_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+  const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
+  // Band storage spans n * a_ld; vectors span n times their stride.
+  const auto a_size = n * a_ld;
+  const auto x_size = n * x_inc;
+  const auto y_size = n * y_inc;
+  auto a_buffer = Buffer<float2>(context, a_size);
+  auto x_buffer = Buffer<float2>(context, x_size);
+  auto y_buffer = Buffer<float2>(context, y_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast<float2*>(y));
+  auto queue_cl = queue();
+  auto s = Hbmv(static_cast<clblast::Layout>(layout),
+                static_cast<clblast::Triangle>(triangle),
+                n, k,
+                alpha_cpp,
+                a_buffer(), 0, a_ld,
+                x_buffer(), 0, x_inc,
+                beta_cpp,
+                y_buffer(), 0, y_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast<float2*>(y));
+}
+// HBMV (double complex): Hermitian banded matrix-vector multiply.
+void cblas_zhbmv(const Layout layout, const Triangle triangle,
+                 const int n, const int k,
+                 const void* alpha,
+                 const void* a, const int a_ld,
+                 const void* x, const int x_inc,
+                 const void* beta,
+                 void* y, const int y_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+  const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
+  // Band storage spans n * a_ld; vectors span n times their stride.
+  const auto a_size = n * a_ld;
+  const auto x_size = n * x_inc;
+  const auto y_size = n * y_inc;
+  auto a_buffer = Buffer<double2>(context, a_size);
+  auto x_buffer = Buffer<double2>(context, x_size);
+  auto y_buffer = Buffer<double2>(context, y_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast<double2*>(y));
+  auto queue_cl = queue();
+  auto s = Hbmv(static_cast<clblast::Layout>(layout),
+                static_cast<clblast::Triangle>(triangle),
+                n, k,
+                alpha_cpp,
+                a_buffer(), 0, a_ld,
+                x_buffer(), 0, x_inc,
+                beta_cpp,
+                y_buffer(), 0, y_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast<double2*>(y));
+}
+
+// HPMV
+// HPMV: Hermitian packed matrix-vector multiply; ap holds one triangle of the
+// n-by-n matrix in packed form, i.e. n*(n+1)/2 elements.
+void cblas_chpmv(const Layout layout, const Triangle triangle,
+                 const int n,
+                 const void* alpha,
+                 const void* ap,
+                 const void* x, const int x_inc,
+                 const void* beta,
+                 void* y, const int y_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+  const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
+  // Packed storage of a triangle: n*(n+1)/2 elements, not n.
+  const auto ap_size = ((n*(n+1)) / 2);
+  const auto x_size = n * x_inc;
+  const auto y_size = n * y_inc;
+  auto ap_buffer = Buffer<float2>(context, ap_size);
+  auto x_buffer = Buffer<float2>(context, x_size);
+  auto y_buffer = Buffer<float2>(context, y_size);
+  ap_buffer.Write(queue, ap_size, reinterpret_cast<const float2*>(ap));
+  x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast<float2*>(y));
+  auto queue_cl = queue();
+  auto s = Hpmv(static_cast<clblast::Layout>(layout),
+                static_cast<clblast::Triangle>(triangle),
+                n,
+                alpha_cpp,
+                ap_buffer(), 0,
+                x_buffer(), 0, x_inc,
+                beta_cpp,
+                y_buffer(), 0, y_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast<float2*>(y));
+}
+// HPMV (double complex): Hermitian packed matrix-vector multiply;
+// ap holds n*(n+1)/2 packed elements of one triangle.
+void cblas_zhpmv(const Layout layout, const Triangle triangle,
+                 const int n,
+                 const void* alpha,
+                 const void* ap,
+                 const void* x, const int x_inc,
+                 const void* beta,
+                 void* y, const int y_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+  const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
+  // Packed storage of a triangle: n*(n+1)/2 elements, not n.
+  const auto ap_size = ((n*(n+1)) / 2);
+  const auto x_size = n * x_inc;
+  const auto y_size = n * y_inc;
+  auto ap_buffer = Buffer<double2>(context, ap_size);
+  auto x_buffer = Buffer<double2>(context, x_size);
+  auto y_buffer = Buffer<double2>(context, y_size);
+  ap_buffer.Write(queue, ap_size, reinterpret_cast<const double2*>(ap));
+  x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast<double2*>(y));
+  auto queue_cl = queue();
+  auto s = Hpmv(static_cast<clblast::Layout>(layout),
+                static_cast<clblast::Triangle>(triangle),
+                n,
+                alpha_cpp,
+                ap_buffer(), 0,
+                x_buffer(), 0, x_inc,
+                beta_cpp,
+                y_buffer(), 0, y_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast<double2*>(y));
+}
+
+// SYMV
+// SYMV: symmetric matrix-vector multiply y = alpha*A*x + beta*y, A is n-by-n.
+void cblas_ssymv(const Layout layout, const Triangle triangle,
+                 const int n,
+                 const float alpha,
+                 const float* a, const int a_ld,
+                 const float* x, const int x_inc,
+                 const float beta,
+                 float* y, const int y_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = alpha;
+  const auto beta_cpp = beta;
+  // n-by-n matrix storage spans n * a_ld; vectors span n times their stride.
+  const auto a_size = n * a_ld;
+  const auto x_size = n * x_inc;
+  const auto y_size = n * y_inc;
+  auto a_buffer = Buffer<float>(context, a_size);
+  auto x_buffer = Buffer<float>(context, x_size);
+  auto y_buffer = Buffer<float>(context, y_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast<float*>(y));
+  auto queue_cl = queue();
+  auto s = Symv(static_cast<clblast::Layout>(layout),
+                static_cast<clblast::Triangle>(triangle),
+                n,
+                alpha_cpp,
+                a_buffer(), 0, a_ld,
+                x_buffer(), 0, x_inc,
+                beta_cpp,
+                y_buffer(), 0, y_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast<float*>(y));
+}
+// SYMV (double): symmetric matrix-vector multiply, A is n-by-n.
+void cblas_dsymv(const Layout layout, const Triangle triangle,
+                 const int n,
+                 const double alpha,
+                 const double* a, const int a_ld,
+                 const double* x, const int x_inc,
+                 const double beta,
+                 double* y, const int y_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = alpha;
+  const auto beta_cpp = beta;
+  // n-by-n matrix storage spans n * a_ld; vectors span n times their stride.
+  const auto a_size = n * a_ld;
+  const auto x_size = n * x_inc;
+  const auto y_size = n * y_inc;
+  auto a_buffer = Buffer<double>(context, a_size);
+  auto x_buffer = Buffer<double>(context, x_size);
+  auto y_buffer = Buffer<double>(context, y_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast<double*>(y));
+  auto queue_cl = queue();
+  auto s = Symv(static_cast<clblast::Layout>(layout),
+                static_cast<clblast::Triangle>(triangle),
+                n,
+                alpha_cpp,
+                a_buffer(), 0, a_ld,
+                x_buffer(), 0, x_inc,
+                beta_cpp,
+                y_buffer(), 0, y_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast<double*>(y));
+}
+
+// SBMV: y = alpha * A * x + beta * y, with A a symmetric banded n-by-n matrix
+// stored in band format (a_ld >= k + 1).
+void cblas_ssbmv(const Layout layout, const Triangle triangle,
+                 const int n, const int k,
+                 const float alpha,
+                 const float* a, const int a_ld,
+                 const float* x, const int x_inc,
+                 const float beta,
+                 float* y, const int y_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = alpha;
+  const auto beta_cpp = beta;
+  // Band storage occupies n major-dimension vectors of a_ld elements; a size
+  // of plain 'n' would under-allocate and read past the host array.
+  const auto a_size = n * a_ld;
+  auto a_buffer = Buffer<float>(context, a_size);
+  // Vectors cover n elements strided by their increment.
+  const auto x_size = n * x_inc;
+  auto x_buffer = Buffer<float>(context, x_size);
+  const auto y_size = n * y_inc;
+  auto y_buffer = Buffer<float>(context, y_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast<float*>(y));
+  auto queue_cl = queue();
+  auto s = Sbmv(static_cast<clblast::Layout>(layout),
+                static_cast<clblast::Triangle>(triangle),
+                n, k,
+                alpha_cpp,
+                a_buffer(), 0, a_ld,
+                x_buffer(), 0, x_inc,
+                beta_cpp,
+                y_buffer(), 0, y_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast<float*>(y));
+}
+void cblas_dsbmv(const Layout layout, const Triangle triangle,
+                 const int n, const int k,
+                 const double alpha,
+                 const double* a, const int a_ld,
+                 const double* x, const int x_inc,
+                 const double beta,
+                 double* y, const int y_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = alpha;
+  const auto beta_cpp = beta;
+  // Band-matrix and strided-vector extents (see cblas_ssbmv above).
+  const auto a_size = n * a_ld;
+  auto a_buffer = Buffer<double>(context, a_size);
+  const auto x_size = n * x_inc;
+  auto x_buffer = Buffer<double>(context, x_size);
+  const auto y_size = n * y_inc;
+  auto y_buffer = Buffer<double>(context, y_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast<double*>(y));
+  auto queue_cl = queue();
+  auto s = Sbmv(static_cast<clblast::Layout>(layout),
+                static_cast<clblast::Triangle>(triangle),
+                n, k,
+                alpha_cpp,
+                a_buffer(), 0, a_ld,
+                x_buffer(), 0, x_inc,
+                beta_cpp,
+                y_buffer(), 0, y_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast<double*>(y));
+}
+
+// SPMV: y = alpha * A * x + beta * y, with A a symmetric n-by-n matrix in
+// packed storage (only one triangle kept, n*(n+1)/2 elements).
+void cblas_sspmv(const Layout layout, const Triangle triangle,
+                 const int n,
+                 const float alpha,
+                 const float* ap,
+                 const float* x, const int x_inc,
+                 const float beta,
+                 float* y, const int y_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = alpha;
+  const auto beta_cpp = beta;
+  // Packed triangular storage holds n*(n+1)/2 elements, not n.
+  const auto ap_size = ((n * (n + 1)) / 2);
+  auto ap_buffer = Buffer<float>(context, ap_size);
+  // Vectors cover n elements strided by their increment.
+  const auto x_size = n * x_inc;
+  auto x_buffer = Buffer<float>(context, x_size);
+  const auto y_size = n * y_inc;
+  auto y_buffer = Buffer<float>(context, y_size);
+  ap_buffer.Write(queue, ap_size, reinterpret_cast<const float*>(ap));
+  x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast<float*>(y));
+  auto queue_cl = queue();
+  auto s = Spmv(static_cast<clblast::Layout>(layout),
+                static_cast<clblast::Triangle>(triangle),
+                n,
+                alpha_cpp,
+                ap_buffer(), 0,
+                x_buffer(), 0, x_inc,
+                beta_cpp,
+                y_buffer(), 0, y_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast<float*>(y));
+}
+void cblas_dspmv(const Layout layout, const Triangle triangle,
+                 const int n,
+                 const double alpha,
+                 const double* ap,
+                 const double* x, const int x_inc,
+                 const double beta,
+                 double* y, const int y_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = alpha;
+  const auto beta_cpp = beta;
+  // Packed-matrix and strided-vector extents (see cblas_sspmv above).
+  const auto ap_size = ((n * (n + 1)) / 2);
+  auto ap_buffer = Buffer<double>(context, ap_size);
+  const auto x_size = n * x_inc;
+  auto x_buffer = Buffer<double>(context, x_size);
+  const auto y_size = n * y_inc;
+  auto y_buffer = Buffer<double>(context, y_size);
+  ap_buffer.Write(queue, ap_size, reinterpret_cast<const double*>(ap));
+  x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast<double*>(y));
+  auto queue_cl = queue();
+  auto s = Spmv(static_cast<clblast::Layout>(layout),
+                static_cast<clblast::Triangle>(triangle),
+                n,
+                alpha_cpp,
+                ap_buffer(), 0,
+                x_buffer(), 0, x_inc,
+                beta_cpp,
+                y_buffer(), 0, y_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast<double*>(y));
+}
+
+// TRMV: x = A * x, with A a triangular n-by-n matrix.
+void cblas_strmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                 const int n,
+                 const float* a, const int a_ld,
+                 float* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  // The matrix spans n major-dimension vectors of a_ld elements each; a size
+  // of plain 'n' would under-allocate and read past the host array.
+  const auto a_size = n * a_ld;
+  auto a_buffer = Buffer<float>(context, a_size);
+  // The vector covers n elements strided by its increment.
+  const auto x_size = n * x_inc;
+  auto x_buffer = Buffer<float>(context, x_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<float*>(x));
+  auto queue_cl = queue();
+  auto s = Trmv<float>(static_cast<clblast::Layout>(layout),
+                       static_cast<clblast::Triangle>(triangle),
+                       static_cast<clblast::Transpose>(a_transpose),
+                       static_cast<clblast::Diagonal>(diagonal),
+                       n,
+                       a_buffer(), 0, a_ld,
+                       x_buffer(), 0, x_inc,
+                       &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  x_buffer.Read(queue, x_size, reinterpret_cast<float*>(x));
+}
+void cblas_dtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                 const int n,
+                 const double* a, const int a_ld,
+                 double* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  // Matrix and strided-vector extents (see cblas_strmv above).
+  const auto a_size = n * a_ld;
+  auto a_buffer = Buffer<double>(context, a_size);
+  const auto x_size = n * x_inc;
+  auto x_buffer = Buffer<double>(context, x_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<double*>(x));
+  auto queue_cl = queue();
+  auto s = Trmv<double>(static_cast<clblast::Layout>(layout),
+                        static_cast<clblast::Triangle>(triangle),
+                        static_cast<clblast::Transpose>(a_transpose),
+                        static_cast<clblast::Diagonal>(diagonal),
+                        n,
+                        a_buffer(), 0, a_ld,
+                        x_buffer(), 0, x_inc,
+                        &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  x_buffer.Read(queue, x_size, reinterpret_cast<double*>(x));
+}
+void cblas_ctrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                 const int n,
+                 const void* a, const int a_ld,
+                 void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  // Matrix and strided-vector extents (see cblas_strmv above).
+  const auto a_size = n * a_ld;
+  auto a_buffer = Buffer<float2>(context, a_size);
+  const auto x_size = n * x_inc;
+  auto x_buffer = Buffer<float2>(context, x_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<float2*>(x));
+  auto queue_cl = queue();
+  auto s = Trmv<float2>(static_cast<clblast::Layout>(layout),
+                        static_cast<clblast::Triangle>(triangle),
+                        static_cast<clblast::Transpose>(a_transpose),
+                        static_cast<clblast::Diagonal>(diagonal),
+                        n,
+                        a_buffer(), 0, a_ld,
+                        x_buffer(), 0, x_inc,
+                        &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  x_buffer.Read(queue, x_size, reinterpret_cast<float2*>(x));
+}
+void cblas_ztrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                 const int n,
+                 const void* a, const int a_ld,
+                 void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  // Matrix and strided-vector extents (see cblas_strmv above).
+  const auto a_size = n * a_ld;
+  auto a_buffer = Buffer<double2>(context, a_size);
+  const auto x_size = n * x_inc;
+  auto x_buffer = Buffer<double2>(context, x_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<double2*>(x));
+  auto queue_cl = queue();
+  auto s = Trmv<double2>(static_cast<clblast::Layout>(layout),
+                         static_cast<clblast::Triangle>(triangle),
+                         static_cast<clblast::Transpose>(a_transpose),
+                         static_cast<clblast::Diagonal>(diagonal),
+                         n,
+                         a_buffer(), 0, a_ld,
+                         x_buffer(), 0, x_inc,
+                         &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  x_buffer.Read(queue, x_size, reinterpret_cast<double2*>(x));
+}
+
+// TBMV: x = A * x, with A a triangular banded n-by-n matrix in band storage
+// (a_ld >= k + 1).
+void cblas_stbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                 const int n, const int k,
+                 const float* a, const int a_ld,
+                 float* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  // Band storage occupies n major-dimension vectors of a_ld elements; a size
+  // of plain 'n' would under-allocate and read past the host array.
+  const auto a_size = n * a_ld;
+  auto a_buffer = Buffer<float>(context, a_size);
+  // The vector covers n elements strided by its increment.
+  const auto x_size = n * x_inc;
+  auto x_buffer = Buffer<float>(context, x_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<float*>(x));
+  auto queue_cl = queue();
+  auto s = Tbmv<float>(static_cast<clblast::Layout>(layout),
+                       static_cast<clblast::Triangle>(triangle),
+                       static_cast<clblast::Transpose>(a_transpose),
+                       static_cast<clblast::Diagonal>(diagonal),
+                       n, k,
+                       a_buffer(), 0, a_ld,
+                       x_buffer(), 0, x_inc,
+                       &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  x_buffer.Read(queue, x_size, reinterpret_cast<float*>(x));
+}
+void cblas_dtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                 const int n, const int k,
+                 const double* a, const int a_ld,
+                 double* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  // Band-matrix and strided-vector extents (see cblas_stbmv above).
+  const auto a_size = n * a_ld;
+  auto a_buffer = Buffer<double>(context, a_size);
+  const auto x_size = n * x_inc;
+  auto x_buffer = Buffer<double>(context, x_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<double*>(x));
+  auto queue_cl = queue();
+  auto s = Tbmv<double>(static_cast<clblast::Layout>(layout),
+                        static_cast<clblast::Triangle>(triangle),
+                        static_cast<clblast::Transpose>(a_transpose),
+                        static_cast<clblast::Diagonal>(diagonal),
+                        n, k,
+                        a_buffer(), 0, a_ld,
+                        x_buffer(), 0, x_inc,
+                        &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  x_buffer.Read(queue, x_size, reinterpret_cast<double*>(x));
+}
+void cblas_ctbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                 const int n, const int k,
+                 const void* a, const int a_ld,
+                 void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  // Band-matrix and strided-vector extents (see cblas_stbmv above).
+  const auto a_size = n * a_ld;
+  auto a_buffer = Buffer<float2>(context, a_size);
+  const auto x_size = n * x_inc;
+  auto x_buffer = Buffer<float2>(context, x_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<float2*>(x));
+  auto queue_cl = queue();
+  auto s = Tbmv<float2>(static_cast<clblast::Layout>(layout),
+                        static_cast<clblast::Triangle>(triangle),
+                        static_cast<clblast::Transpose>(a_transpose),
+                        static_cast<clblast::Diagonal>(diagonal),
+                        n, k,
+                        a_buffer(), 0, a_ld,
+                        x_buffer(), 0, x_inc,
+                        &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  x_buffer.Read(queue, x_size, reinterpret_cast<float2*>(x));
+}
+void cblas_ztbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                 const int n, const int k,
+                 const void* a, const int a_ld,
+                 void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  // Band-matrix and strided-vector extents (see cblas_stbmv above).
+  const auto a_size = n * a_ld;
+  auto a_buffer = Buffer<double2>(context, a_size);
+  const auto x_size = n * x_inc;
+  auto x_buffer = Buffer<double2>(context, x_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<double2*>(x));
+  auto queue_cl = queue();
+  auto s = Tbmv<double2>(static_cast<clblast::Layout>(layout),
+                         static_cast<clblast::Triangle>(triangle),
+                         static_cast<clblast::Transpose>(a_transpose),
+                         static_cast<clblast::Diagonal>(diagonal),
+                         n, k,
+                         a_buffer(), 0, a_ld,
+                         x_buffer(), 0, x_inc,
+                         &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  x_buffer.Read(queue, x_size, reinterpret_cast<double2*>(x));
+}
+
+// TPMV: x = A * x, with A a triangular n-by-n matrix in packed storage
+// (only one triangle kept, n*(n+1)/2 elements).
+void cblas_stpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                 const int n,
+                 const float* ap,
+                 float* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  // Packed triangular storage holds n*(n+1)/2 elements, not n.
+  const auto ap_size = ((n * (n + 1)) / 2);
+  auto ap_buffer = Buffer<float>(context, ap_size);
+  // The vector covers n elements strided by its increment.
+  const auto x_size = n * x_inc;
+  auto x_buffer = Buffer<float>(context, x_size);
+  ap_buffer.Write(queue, ap_size, reinterpret_cast<const float*>(ap));
+  x_buffer.Write(queue, x_size, reinterpret_cast<float*>(x));
+  auto queue_cl = queue();
+  auto s = Tpmv<float>(static_cast<clblast::Layout>(layout),
+                       static_cast<clblast::Triangle>(triangle),
+                       static_cast<clblast::Transpose>(a_transpose),
+                       static_cast<clblast::Diagonal>(diagonal),
+                       n,
+                       ap_buffer(), 0,
+                       x_buffer(), 0, x_inc,
+                       &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  x_buffer.Read(queue, x_size, reinterpret_cast<float*>(x));
+}
+void cblas_dtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                 const int n,
+                 const double* ap,
+                 double* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  // Packed-matrix and strided-vector extents (see cblas_stpmv above).
+  const auto ap_size = ((n * (n + 1)) / 2);
+  auto ap_buffer = Buffer<double>(context, ap_size);
+  const auto x_size = n * x_inc;
+  auto x_buffer = Buffer<double>(context, x_size);
+  ap_buffer.Write(queue, ap_size, reinterpret_cast<const double*>(ap));
+  x_buffer.Write(queue, x_size, reinterpret_cast<double*>(x));
+  auto queue_cl = queue();
+  auto s = Tpmv<double>(static_cast<clblast::Layout>(layout),
+                        static_cast<clblast::Triangle>(triangle),
+                        static_cast<clblast::Transpose>(a_transpose),
+                        static_cast<clblast::Diagonal>(diagonal),
+                        n,
+                        ap_buffer(), 0,
+                        x_buffer(), 0, x_inc,
+                        &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  x_buffer.Read(queue, x_size, reinterpret_cast<double*>(x));
+}
+void cblas_ctpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                 const int n,
+                 const void* ap,
+                 void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  // Packed-matrix and strided-vector extents (see cblas_stpmv above).
+  const auto ap_size = ((n * (n + 1)) / 2);
+  auto ap_buffer = Buffer<float2>(context, ap_size);
+  const auto x_size = n * x_inc;
+  auto x_buffer = Buffer<float2>(context, x_size);
+  ap_buffer.Write(queue, ap_size, reinterpret_cast<const float2*>(ap));
+  x_buffer.Write(queue, x_size, reinterpret_cast<float2*>(x));
+  auto queue_cl = queue();
+  auto s = Tpmv<float2>(static_cast<clblast::Layout>(layout),
+                        static_cast<clblast::Triangle>(triangle),
+                        static_cast<clblast::Transpose>(a_transpose),
+                        static_cast<clblast::Diagonal>(diagonal),
+                        n,
+                        ap_buffer(), 0,
+                        x_buffer(), 0, x_inc,
+                        &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  x_buffer.Read(queue, x_size, reinterpret_cast<float2*>(x));
+}
+void cblas_ztpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                 const int n,
+                 const void* ap,
+                 void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  // Packed-matrix and strided-vector extents (see cblas_stpmv above).
+  const auto ap_size = ((n * (n + 1)) / 2);
+  auto ap_buffer = Buffer<double2>(context, ap_size);
+  const auto x_size = n * x_inc;
+  auto x_buffer = Buffer<double2>(context, x_size);
+  ap_buffer.Write(queue, ap_size, reinterpret_cast<const double2*>(ap));
+  x_buffer.Write(queue, x_size, reinterpret_cast<double2*>(x));
+  auto queue_cl = queue();
+  auto s = Tpmv<double2>(static_cast<clblast::Layout>(layout),
+                         static_cast<clblast::Triangle>(triangle),
+                         static_cast<clblast::Transpose>(a_transpose),
+                         static_cast<clblast::Diagonal>(diagonal),
+                         n,
+                         ap_buffer(), 0,
+                         x_buffer(), 0, x_inc,
+                         &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  x_buffer.Read(queue, x_size, reinterpret_cast<double2*>(x));
+}
+
+// TRSV: solves A * x = b for x, with A a triangular n-by-n matrix
+// (b is passed in x and overwritten with the solution).
+void cblas_strsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                 const int n,
+                 const float* a, const int a_ld,
+                 float* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  // The matrix spans n major-dimension vectors of a_ld elements each; a size
+  // of plain 'n' would under-allocate and read past the host array.
+  const auto a_size = n * a_ld;
+  auto a_buffer = Buffer<float>(context, a_size);
+  // The vector covers n elements strided by its increment.
+  const auto x_size = n * x_inc;
+  auto x_buffer = Buffer<float>(context, x_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<float*>(x));
+  auto queue_cl = queue();
+  auto s = Trsv<float>(static_cast<clblast::Layout>(layout),
+                       static_cast<clblast::Triangle>(triangle),
+                       static_cast<clblast::Transpose>(a_transpose),
+                       static_cast<clblast::Diagonal>(diagonal),
+                       n,
+                       a_buffer(), 0, a_ld,
+                       x_buffer(), 0, x_inc,
+                       &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  x_buffer.Read(queue, x_size, reinterpret_cast<float*>(x));
+}
+void cblas_dtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                 const int n,
+                 const double* a, const int a_ld,
+                 double* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  // Matrix and strided-vector extents (see cblas_strsv above).
+  const auto a_size = n * a_ld;
+  auto a_buffer = Buffer<double>(context, a_size);
+  const auto x_size = n * x_inc;
+  auto x_buffer = Buffer<double>(context, x_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<double*>(x));
+  auto queue_cl = queue();
+  auto s = Trsv<double>(static_cast<clblast::Layout>(layout),
+                        static_cast<clblast::Triangle>(triangle),
+                        static_cast<clblast::Transpose>(a_transpose),
+                        static_cast<clblast::Diagonal>(diagonal),
+                        n,
+                        a_buffer(), 0, a_ld,
+                        x_buffer(), 0, x_inc,
+                        &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  x_buffer.Read(queue, x_size, reinterpret_cast<double*>(x));
+}
+void cblas_ctrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                 const int n,
+                 const void* a, const int a_ld,
+                 void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  // Matrix and strided-vector extents (see cblas_strsv above).
+  const auto a_size = n * a_ld;
+  auto a_buffer = Buffer<float2>(context, a_size);
+  const auto x_size = n * x_inc;
+  auto x_buffer = Buffer<float2>(context, x_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<float2*>(x));
+  auto queue_cl = queue();
+  auto s = Trsv<float2>(static_cast<clblast::Layout>(layout),
+                        static_cast<clblast::Triangle>(triangle),
+                        static_cast<clblast::Transpose>(a_transpose),
+                        static_cast<clblast::Diagonal>(diagonal),
+                        n,
+                        a_buffer(), 0, a_ld,
+                        x_buffer(), 0, x_inc,
+                        &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  x_buffer.Read(queue, x_size, reinterpret_cast<float2*>(x));
+}
+void cblas_ztrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                 const int n,
+                 const void* a, const int a_ld,
+                 void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  // Matrix and strided-vector extents (see cblas_strsv above).
+  const auto a_size = n * a_ld;
+  auto a_buffer = Buffer<double2>(context, a_size);
+  const auto x_size = n * x_inc;
+  auto x_buffer = Buffer<double2>(context, x_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<double2*>(x));
+  auto queue_cl = queue();
+  auto s = Trsv<double2>(static_cast<clblast::Layout>(layout),
+                         static_cast<clblast::Triangle>(triangle),
+                         static_cast<clblast::Transpose>(a_transpose),
+                         static_cast<clblast::Diagonal>(diagonal),
+                         n,
+                         a_buffer(), 0, a_ld,
+                         x_buffer(), 0, x_inc,
+                         &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  x_buffer.Read(queue, x_size, reinterpret_cast<double2*>(x));
+}
+
+// TBSV: solves A * x = b for x, with A a triangular banded n-by-n matrix in
+// band storage (a_ld >= k + 1); b is passed in x and overwritten.
+void cblas_stbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                 const int n, const int k,
+                 const float* a, const int a_ld,
+                 float* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  // Band storage occupies n major-dimension vectors of a_ld elements; a size
+  // of plain 'n' would under-allocate and read past the host array.
+  const auto a_size = n * a_ld;
+  auto a_buffer = Buffer<float>(context, a_size);
+  // The vector covers n elements strided by its increment.
+  const auto x_size = n * x_inc;
+  auto x_buffer = Buffer<float>(context, x_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<float*>(x));
+  auto queue_cl = queue();
+  auto s = Tbsv<float>(static_cast<clblast::Layout>(layout),
+                       static_cast<clblast::Triangle>(triangle),
+                       static_cast<clblast::Transpose>(a_transpose),
+                       static_cast<clblast::Diagonal>(diagonal),
+                       n, k,
+                       a_buffer(), 0, a_ld,
+                       x_buffer(), 0, x_inc,
+                       &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  x_buffer.Read(queue, x_size, reinterpret_cast<float*>(x));
+}
+void cblas_dtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                 const int n, const int k,
+                 const double* a, const int a_ld,
+                 double* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  // Band-matrix and strided-vector extents (see cblas_stbsv above).
+  const auto a_size = n * a_ld;
+  auto a_buffer = Buffer<double>(context, a_size);
+  const auto x_size = n * x_inc;
+  auto x_buffer = Buffer<double>(context, x_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<double*>(x));
+  auto queue_cl = queue();
+  auto s = Tbsv<double>(static_cast<clblast::Layout>(layout),
+                        static_cast<clblast::Triangle>(triangle),
+                        static_cast<clblast::Transpose>(a_transpose),
+                        static_cast<clblast::Diagonal>(diagonal),
+                        n, k,
+                        a_buffer(), 0, a_ld,
+                        x_buffer(), 0, x_inc,
+                        &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  x_buffer.Read(queue, x_size, reinterpret_cast<double*>(x));
+}
+void cblas_ctbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                 const int n, const int k,
+                 const void* a, const int a_ld,
+                 void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  // Band-matrix and strided-vector extents (see cblas_stbsv above).
+  const auto a_size = n * a_ld;
+  auto a_buffer = Buffer<float2>(context, a_size);
+  const auto x_size = n * x_inc;
+  auto x_buffer = Buffer<float2>(context, x_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<float2*>(x));
+  auto queue_cl = queue();
+  auto s = Tbsv<float2>(static_cast<clblast::Layout>(layout),
+                        static_cast<clblast::Triangle>(triangle),
+                        static_cast<clblast::Transpose>(a_transpose),
+                        static_cast<clblast::Diagonal>(diagonal),
+                        n, k,
+                        a_buffer(), 0, a_ld,
+                        x_buffer(), 0, x_inc,
+                        &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  x_buffer.Read(queue, x_size, reinterpret_cast<float2*>(x));
+}
+void cblas_ztbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                 const int n, const int k,
+                 const void* a, const int a_ld,
+                 void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  // Band-matrix and strided-vector extents (see cblas_stbsv above).
+  const auto a_size = n * a_ld;
+  auto a_buffer = Buffer<double2>(context, a_size);
+  const auto x_size = n * x_inc;
+  auto x_buffer = Buffer<double2>(context, x_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<double2*>(x));
+  auto queue_cl = queue();
+  auto s = Tbsv<double2>(static_cast<clblast::Layout>(layout),
+                         static_cast<clblast::Triangle>(triangle),
+                         static_cast<clblast::Transpose>(a_transpose),
+                         static_cast<clblast::Diagonal>(diagonal),
+                         n, k,
+                         a_buffer(), 0, a_ld,
+                         x_buffer(), 0, x_inc,
+                         &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  x_buffer.Read(queue, x_size, reinterpret_cast<double2*>(x));
+}
+
+// TPSV: solves A * x = b for x, with A a triangular n-by-n matrix in packed
+// storage (n*(n+1)/2 elements); b is passed in x and overwritten.
+void cblas_stpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                 const int n,
+                 const float* ap,
+                 float* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  // Packed triangular storage holds n*(n+1)/2 elements, not n.
+  const auto ap_size = ((n * (n + 1)) / 2);
+  auto ap_buffer = Buffer<float>(context, ap_size);
+  // The vector covers n elements strided by its increment.
+  const auto x_size = n * x_inc;
+  auto x_buffer = Buffer<float>(context, x_size);
+  ap_buffer.Write(queue, ap_size, reinterpret_cast<const float*>(ap));
+  x_buffer.Write(queue, x_size, reinterpret_cast<float*>(x));
+  auto queue_cl = queue();
+  auto s = Tpsv<float>(static_cast<clblast::Layout>(layout),
+                       static_cast<clblast::Triangle>(triangle),
+                       static_cast<clblast::Transpose>(a_transpose),
+                       static_cast<clblast::Diagonal>(diagonal),
+                       n,
+                       ap_buffer(), 0,
+                       x_buffer(), 0, x_inc,
+                       &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  x_buffer.Read(queue, x_size, reinterpret_cast<float*>(x));
+}
+void cblas_dtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                 const int n,
+                 const double* ap,
+                 double* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  // Packed-matrix and strided-vector extents (see cblas_stpsv above).
+  const auto ap_size = ((n * (n + 1)) / 2);
+  auto ap_buffer = Buffer<double>(context, ap_size);
+  const auto x_size = n * x_inc;
+  auto x_buffer = Buffer<double>(context, x_size);
+  ap_buffer.Write(queue, ap_size, reinterpret_cast<const double*>(ap));
+  x_buffer.Write(queue, x_size, reinterpret_cast<double*>(x));
+  auto queue_cl = queue();
+  auto s = Tpsv<double>(static_cast<clblast::Layout>(layout),
+                        static_cast<clblast::Triangle>(triangle),
+                        static_cast<clblast::Transpose>(a_transpose),
+                        static_cast<clblast::Diagonal>(diagonal),
+                        n,
+                        ap_buffer(), 0,
+                        x_buffer(), 0, x_inc,
+                        &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  x_buffer.Read(queue, x_size, reinterpret_cast<double*>(x));
+}
+void cblas_ctpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                 const int n,
+                 const void* ap,
+                 void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  // Packed-matrix and strided-vector extents (see cblas_stpsv above).
+  const auto ap_size = ((n * (n + 1)) / 2);
+  auto ap_buffer = Buffer<float2>(context, ap_size);
+  const auto x_size = n * x_inc;
+  auto x_buffer = Buffer<float2>(context, x_size);
+  ap_buffer.Write(queue, ap_size, reinterpret_cast<const float2*>(ap));
+  x_buffer.Write(queue, x_size, reinterpret_cast<float2*>(x));
+  auto queue_cl = queue();
+  auto s = Tpsv<float2>(static_cast<clblast::Layout>(layout),
+                        static_cast<clblast::Triangle>(triangle),
+                        static_cast<clblast::Transpose>(a_transpose),
+                        static_cast<clblast::Diagonal>(diagonal),
+                        n,
+                        ap_buffer(), 0,
+                        x_buffer(), 0, x_inc,
+                        &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  x_buffer.Read(queue, x_size, reinterpret_cast<float2*>(x));
+}
+void cblas_ztpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                 const int n,
+                 const void* ap,
+                 void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  // Packed-matrix and strided-vector extents (see cblas_stpsv above).
+  const auto ap_size = ((n * (n + 1)) / 2);
+  auto ap_buffer = Buffer<double2>(context, ap_size);
+  const auto x_size = n * x_inc;
+  auto x_buffer = Buffer<double2>(context, x_size);
+  ap_buffer.Write(queue, ap_size, reinterpret_cast<const double2*>(ap));
+  x_buffer.Write(queue, x_size, reinterpret_cast<double2*>(x));
+  auto queue_cl = queue();
+  auto s = Tpsv<double2>(static_cast<clblast::Layout>(layout),
+                         static_cast<clblast::Triangle>(triangle),
+                         static_cast<clblast::Transpose>(a_transpose),
+                         static_cast<clblast::Diagonal>(diagonal),
+                         n,
+                         ap_buffer(), 0,
+                         x_buffer(), 0, x_inc,
+                         &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  x_buffer.Read(queue, x_size, reinterpret_cast<double2*>(x));
+}
+
+// GER: A = alpha * x * y^T + A, rank-1 update of an m-by-n matrix A.
+void cblas_sger(const Layout layout,
+                const int m, const int n,
+                const float alpha,
+                const float* x, const int x_inc,
+                const float* y, const int y_inc,
+                float* a, const int a_ld) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = alpha;
+  // x has m elements and y has n elements; the original sized both with n,
+  // which under-allocates x whenever m > n.
+  const auto x_size = m * x_inc;
+  auto x_buffer = Buffer<float>(context, x_size);
+  const auto y_size = n * y_inc;
+  auto y_buffer = Buffer<float>(context, y_size);
+  // The m-by-n matrix spans a_ld elements per major-dimension vector: n
+  // columns when column-major, m rows when row-major.
+  const auto a_size = ((layout == kColMajor) ? n : m) * a_ld;
+  auto a_buffer = Buffer<float>(context, a_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast<const float*>(y));
+  a_buffer.Write(queue, a_size, reinterpret_cast<float*>(a));
+  auto queue_cl = queue();
+  auto s = Ger(static_cast<clblast::Layout>(layout),
+               m, n,
+               alpha_cpp,
+               x_buffer(), 0, x_inc,
+               y_buffer(), 0, y_inc,
+               a_buffer(), 0, a_ld,
+               &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  a_buffer.Read(queue, a_size, reinterpret_cast<float*>(a));
+}
+void cblas_dger(const Layout layout,
+                const int m, const int n,
+                const double alpha,
+                const double* x, const int x_inc,
+                const double* y, const int y_inc,
+                double* a, const int a_ld) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = alpha;
+  // Vector and matrix extents (see cblas_sger above).
+  const auto x_size = m * x_inc;
+  auto x_buffer = Buffer<double>(context, x_size);
+  const auto y_size = n * y_inc;
+  auto y_buffer = Buffer<double>(context, y_size);
+  const auto a_size = ((layout == kColMajor) ? n : m) * a_ld;
+  auto a_buffer = Buffer<double>(context, a_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast<const double*>(y));
+  a_buffer.Write(queue, a_size, reinterpret_cast<double*>(a));
+  auto queue_cl = queue();
+  auto s = Ger(static_cast<clblast::Layout>(layout),
+               m, n,
+               alpha_cpp,
+               x_buffer(), 0, x_inc,
+               y_buffer(), 0, y_inc,
+               a_buffer(), 0, a_ld,
+               &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  a_buffer.Read(queue, a_size, reinterpret_cast<double*>(a));
+}
+
+// GERU
+void cblas_cgeru(const Layout layout,
+ const int m, const int n,
+ const void* alpha,
+ const void* x, const int x_inc,
+ const void* y, const int y_inc,
+ void* a, const int a_ld) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+ const auto x_size = n;
+ auto x_buffer = Buffer<float2>(context, x_size);
+ const auto y_size = n;
+ auto y_buffer = Buffer<float2>(context, y_size);
+ const auto a_size = n;
+ auto a_buffer = Buffer<float2>(context, a_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const float2*>(y));
+ a_buffer.Write(queue, a_size, reinterpret_cast<float2*>(a));
+ auto queue_cl = queue();
+ auto s = Geru(static_cast<clblast::Layout>(layout),
+ m, n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ a_buffer(), 0, a_ld,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ a_buffer.Read(queue, a_size, reinterpret_cast<float2*>(a));
+}
+void cblas_zgeru(const Layout layout,
+ const int m, const int n,
+ const void* alpha,
+ const void* x, const int x_inc,
+ const void* y, const int y_inc,
+ void* a, const int a_ld) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+ const auto x_size = n;
+ auto x_buffer = Buffer<double2>(context, x_size);
+ const auto y_size = n;
+ auto y_buffer = Buffer<double2>(context, y_size);
+ const auto a_size = n;
+ auto a_buffer = Buffer<double2>(context, a_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const double2*>(y));
+ a_buffer.Write(queue, a_size, reinterpret_cast<double2*>(a));
+ auto queue_cl = queue();
+ auto s = Geru(static_cast<clblast::Layout>(layout),
+ m, n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ a_buffer(), 0, a_ld,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ a_buffer.Read(queue, a_size, reinterpret_cast<double2*>(a));
+}
+
+// GERC
+void cblas_cgerc(const Layout layout,
+ const int m, const int n,
+ const void* alpha,
+ const void* x, const int x_inc,
+ const void* y, const int y_inc,
+ void* a, const int a_ld) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+ const auto x_size = n;
+ auto x_buffer = Buffer<float2>(context, x_size);
+ const auto y_size = n;
+ auto y_buffer = Buffer<float2>(context, y_size);
+ const auto a_size = n;
+ auto a_buffer = Buffer<float2>(context, a_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const float2*>(y));
+ a_buffer.Write(queue, a_size, reinterpret_cast<float2*>(a));
+ auto queue_cl = queue();
+ auto s = Gerc(static_cast<clblast::Layout>(layout),
+ m, n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ a_buffer(), 0, a_ld,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ a_buffer.Read(queue, a_size, reinterpret_cast<float2*>(a));
+}
+void cblas_zgerc(const Layout layout,
+ const int m, const int n,
+ const void* alpha,
+ const void* x, const int x_inc,
+ const void* y, const int y_inc,
+ void* a, const int a_ld) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+ const auto x_size = n;
+ auto x_buffer = Buffer<double2>(context, x_size);
+ const auto y_size = n;
+ auto y_buffer = Buffer<double2>(context, y_size);
+ const auto a_size = n;
+ auto a_buffer = Buffer<double2>(context, a_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const double2*>(y));
+ a_buffer.Write(queue, a_size, reinterpret_cast<double2*>(a));
+ auto queue_cl = queue();
+ auto s = Gerc(static_cast<clblast::Layout>(layout),
+ m, n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ a_buffer(), 0, a_ld,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ a_buffer.Read(queue, a_size, reinterpret_cast<double2*>(a));
+}
+
+// HER
+void cblas_cher(const Layout layout, const Triangle triangle,
+ const int n,
+ const void* alpha,
+ const void* x, const int x_inc,
+ void* a, const int a_ld) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto x_size = n;
+ auto x_buffer = Buffer<float2>(context, x_size);
+ const auto a_size = n;
+ auto a_buffer = Buffer<float2>(context, a_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+ a_buffer.Write(queue, a_size, reinterpret_cast<float2*>(a));
+ auto queue_cl = queue();
+ auto s = Her(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ a_buffer(), 0, a_ld,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ a_buffer.Read(queue, a_size, reinterpret_cast<float2*>(a));
+}
+void cblas_zher(const Layout layout, const Triangle triangle,
+ const int n,
+ const void* alpha,
+ const void* x, const int x_inc,
+ void* a, const int a_ld) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto x_size = n;
+ auto x_buffer = Buffer<double2>(context, x_size);
+ const auto a_size = n;
+ auto a_buffer = Buffer<double2>(context, a_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+ a_buffer.Write(queue, a_size, reinterpret_cast<double2*>(a));
+ auto queue_cl = queue();
+ auto s = Her(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ a_buffer(), 0, a_ld,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ a_buffer.Read(queue, a_size, reinterpret_cast<double2*>(a));
+}
+
+// HPR
+void cblas_chpr(const Layout layout, const Triangle triangle,
+ const int n,
+ const void* alpha,
+ const void* x, const int x_inc,
+ void* ap) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto x_size = n;
+ auto x_buffer = Buffer<float2>(context, x_size);
+ const auto ap_size = n;
+ auto ap_buffer = Buffer<float2>(context, ap_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+ ap_buffer.Write(queue, ap_size, reinterpret_cast<float2*>(ap));
+ auto queue_cl = queue();
+ auto s = Hpr(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ ap_buffer(), 0,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ ap_buffer.Read(queue, ap_size, reinterpret_cast<float2*>(ap));
+}
+void cblas_zhpr(const Layout layout, const Triangle triangle,
+ const int n,
+ const void* alpha,
+ const void* x, const int x_inc,
+ void* ap) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto x_size = n;
+ auto x_buffer = Buffer<double2>(context, x_size);
+ const auto ap_size = n;
+ auto ap_buffer = Buffer<double2>(context, ap_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+ ap_buffer.Write(queue, ap_size, reinterpret_cast<double2*>(ap));
+ auto queue_cl = queue();
+ auto s = Hpr(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ ap_buffer(), 0,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ ap_buffer.Read(queue, ap_size, reinterpret_cast<double2*>(ap));
+}
+
+// HER2
+void cblas_cher2(const Layout layout, const Triangle triangle,
+ const int n,
+ const void* alpha,
+ const void* x, const int x_inc,
+ const void* y, const int y_inc,
+ void* a, const int a_ld) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+ const auto x_size = n;
+ auto x_buffer = Buffer<float2>(context, x_size);
+ const auto y_size = n;
+ auto y_buffer = Buffer<float2>(context, y_size);
+ const auto a_size = n;
+ auto a_buffer = Buffer<float2>(context, a_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const float2*>(y));
+ a_buffer.Write(queue, a_size, reinterpret_cast<float2*>(a));
+ auto queue_cl = queue();
+ auto s = Her2(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ a_buffer(), 0, a_ld,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ a_buffer.Read(queue, a_size, reinterpret_cast<float2*>(a));
+}
+void cblas_zher2(const Layout layout, const Triangle triangle,
+ const int n,
+ const void* alpha,
+ const void* x, const int x_inc,
+ const void* y, const int y_inc,
+ void* a, const int a_ld) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+ const auto x_size = n;
+ auto x_buffer = Buffer<double2>(context, x_size);
+ const auto y_size = n;
+ auto y_buffer = Buffer<double2>(context, y_size);
+ const auto a_size = n;
+ auto a_buffer = Buffer<double2>(context, a_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const double2*>(y));
+ a_buffer.Write(queue, a_size, reinterpret_cast<double2*>(a));
+ auto queue_cl = queue();
+ auto s = Her2(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ a_buffer(), 0, a_ld,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ a_buffer.Read(queue, a_size, reinterpret_cast<double2*>(a));
+}
+
+// HPR2
+void cblas_chpr2(const Layout layout, const Triangle triangle,
+ const int n,
+ const void* alpha,
+ const void* x, const int x_inc,
+ const void* y, const int y_inc,
+ void* ap) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+ const auto x_size = n;
+ auto x_buffer = Buffer<float2>(context, x_size);
+ const auto y_size = n;
+ auto y_buffer = Buffer<float2>(context, y_size);
+ const auto ap_size = n;
+ auto ap_buffer = Buffer<float2>(context, ap_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const float2*>(y));
+ ap_buffer.Write(queue, ap_size, reinterpret_cast<float2*>(ap));
+ auto queue_cl = queue();
+ auto s = Hpr2(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ ap_buffer(), 0,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ ap_buffer.Read(queue, ap_size, reinterpret_cast<float2*>(ap));
+}
+void cblas_zhpr2(const Layout layout, const Triangle triangle,
+ const int n,
+ const void* alpha,
+ const void* x, const int x_inc,
+ const void* y, const int y_inc,
+ void* ap) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+ const auto x_size = n;
+ auto x_buffer = Buffer<double2>(context, x_size);
+ const auto y_size = n;
+ auto y_buffer = Buffer<double2>(context, y_size);
+ const auto ap_size = n;
+ auto ap_buffer = Buffer<double2>(context, ap_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const double2*>(y));
+ ap_buffer.Write(queue, ap_size, reinterpret_cast<double2*>(ap));
+ auto queue_cl = queue();
+ auto s = Hpr2(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ ap_buffer(), 0,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ ap_buffer.Read(queue, ap_size, reinterpret_cast<double2*>(ap));
+}
+
+// SYR
+void cblas_ssyr(const Layout layout, const Triangle triangle,
+ const int n,
+ const float alpha,
+ const float* x, const int x_inc,
+ float* a, const int a_ld) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto x_size = n;
+ auto x_buffer = Buffer<float>(context, x_size);
+ const auto a_size = n;
+ auto a_buffer = Buffer<float>(context, a_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+ a_buffer.Write(queue, a_size, reinterpret_cast<float*>(a));
+ auto queue_cl = queue();
+ auto s = Syr(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ a_buffer(), 0, a_ld,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ a_buffer.Read(queue, a_size, reinterpret_cast<float*>(a));
+}
+void cblas_dsyr(const Layout layout, const Triangle triangle,
+ const int n,
+ const double alpha,
+ const double* x, const int x_inc,
+ double* a, const int a_ld) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto x_size = n;
+ auto x_buffer = Buffer<double>(context, x_size);
+ const auto a_size = n;
+ auto a_buffer = Buffer<double>(context, a_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+ a_buffer.Write(queue, a_size, reinterpret_cast<double*>(a));
+ auto queue_cl = queue();
+ auto s = Syr(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ a_buffer(), 0, a_ld,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ a_buffer.Read(queue, a_size, reinterpret_cast<double*>(a));
+}
+
+// SPR
+void cblas_sspr(const Layout layout, const Triangle triangle,
+ const int n,
+ const float alpha,
+ const float* x, const int x_inc,
+ float* ap) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto x_size = n;
+ auto x_buffer = Buffer<float>(context, x_size);
+ const auto ap_size = n;
+ auto ap_buffer = Buffer<float>(context, ap_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+ ap_buffer.Write(queue, ap_size, reinterpret_cast<float*>(ap));
+ auto queue_cl = queue();
+ auto s = Spr(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ ap_buffer(), 0,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ ap_buffer.Read(queue, ap_size, reinterpret_cast<float*>(ap));
+}
+void cblas_dspr(const Layout layout, const Triangle triangle,
+ const int n,
+ const double alpha,
+ const double* x, const int x_inc,
+ double* ap) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto x_size = n;
+ auto x_buffer = Buffer<double>(context, x_size);
+ const auto ap_size = n;
+ auto ap_buffer = Buffer<double>(context, ap_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+ ap_buffer.Write(queue, ap_size, reinterpret_cast<double*>(ap));
+ auto queue_cl = queue();
+ auto s = Spr(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ ap_buffer(), 0,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ ap_buffer.Read(queue, ap_size, reinterpret_cast<double*>(ap));
+}
+
+// SYR2
+void cblas_ssyr2(const Layout layout, const Triangle triangle,
+ const int n,
+ const float alpha,
+ const float* x, const int x_inc,
+ const float* y, const int y_inc,
+ float* a, const int a_ld) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto x_size = n;
+ auto x_buffer = Buffer<float>(context, x_size);
+ const auto y_size = n;
+ auto y_buffer = Buffer<float>(context, y_size);
+ const auto a_size = n;
+ auto a_buffer = Buffer<float>(context, a_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const float*>(y));
+ a_buffer.Write(queue, a_size, reinterpret_cast<float*>(a));
+ auto queue_cl = queue();
+ auto s = Syr2(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ a_buffer(), 0, a_ld,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ a_buffer.Read(queue, a_size, reinterpret_cast<float*>(a));
+}
+void cblas_dsyr2(const Layout layout, const Triangle triangle,
+ const int n,
+ const double alpha,
+ const double* x, const int x_inc,
+ const double* y, const int y_inc,
+ double* a, const int a_ld) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto x_size = n;
+ auto x_buffer = Buffer<double>(context, x_size);
+ const auto y_size = n;
+ auto y_buffer = Buffer<double>(context, y_size);
+ const auto a_size = n;
+ auto a_buffer = Buffer<double>(context, a_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const double*>(y));
+ a_buffer.Write(queue, a_size, reinterpret_cast<double*>(a));
+ auto queue_cl = queue();
+ auto s = Syr2(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ a_buffer(), 0, a_ld,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ a_buffer.Read(queue, a_size, reinterpret_cast<double*>(a));
+}
+
+// SPR2
+void cblas_sspr2(const Layout layout, const Triangle triangle,
+ const int n,
+ const float alpha,
+ const float* x, const int x_inc,
+ const float* y, const int y_inc,
+ float* ap) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto x_size = n;
+ auto x_buffer = Buffer<float>(context, x_size);
+ const auto y_size = n;
+ auto y_buffer = Buffer<float>(context, y_size);
+ const auto ap_size = n;
+ auto ap_buffer = Buffer<float>(context, ap_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const float*>(y));
+ ap_buffer.Write(queue, ap_size, reinterpret_cast<float*>(ap));
+ auto queue_cl = queue();
+ auto s = Spr2(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ ap_buffer(), 0,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ ap_buffer.Read(queue, ap_size, reinterpret_cast<float*>(ap));
+}
+void cblas_dspr2(const Layout layout, const Triangle triangle,
+ const int n,
+ const double alpha,
+ const double* x, const int x_inc,
+ const double* y, const int y_inc,
+ double* ap) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto x_size = n;
+ auto x_buffer = Buffer<double>(context, x_size);
+ const auto y_size = n;
+ auto y_buffer = Buffer<double>(context, y_size);
+ const auto ap_size = n;
+ auto ap_buffer = Buffer<double>(context, ap_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const double*>(y));
+ ap_buffer.Write(queue, ap_size, reinterpret_cast<double*>(ap));
+ auto queue_cl = queue();
+ auto s = Spr2(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ ap_buffer(), 0,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ ap_buffer.Read(queue, ap_size, reinterpret_cast<double*>(ap));
+}
+
+// =================================================================================================
+// BLAS level-3 (matrix-matrix) routines
+// =================================================================================================
+
+// GEMM
+void cblas_sgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
+ const int m, const int n, const int k,
+ const float alpha,
+ const float* a, const int a_ld,
+ const float* b, const int b_ld,
+ const float beta,
+ float* c, const int c_ld) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto beta_cpp = beta;
+ const auto a_size = n;
+ auto a_buffer = Buffer<float>(context, a_size);
+ const auto b_size = n;
+ auto b_buffer = Buffer<float>(context, b_size);
+ const auto c_size = n;
+ auto c_buffer = Buffer<float>(context, c_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<const float*>(b));
+ c_buffer.Write(queue, c_size, reinterpret_cast<float*>(c));
+ auto queue_cl = queue();
+ auto s = Gemm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Transpose>(b_transpose),
+ m, n, k,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ beta_cpp,
+ c_buffer(), 0, c_ld,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ c_buffer.Read(queue, c_size, reinterpret_cast<float*>(c));
+}
+void cblas_dgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
+ const int m, const int n, const int k,
+ const double alpha,
+ const double* a, const int a_ld,
+ const double* b, const int b_ld,
+ const double beta,
+ double* c, const int c_ld) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto beta_cpp = beta;
+ const auto a_size = n;
+ auto a_buffer = Buffer<double>(context, a_size);
+ const auto b_size = n;
+ auto b_buffer = Buffer<double>(context, b_size);
+ const auto c_size = n;
+ auto c_buffer = Buffer<double>(context, c_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<const double*>(b));
+ c_buffer.Write(queue, c_size, reinterpret_cast<double*>(c));
+ auto queue_cl = queue();
+ auto s = Gemm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Transpose>(b_transpose),
+ m, n, k,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ beta_cpp,
+ c_buffer(), 0, c_ld,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ c_buffer.Read(queue, c_size, reinterpret_cast<double*>(c));
+}
+void cblas_cgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
+ const int m, const int n, const int k,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* b, const int b_ld,
+ const void* beta,
+ void* c, const int c_ld) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+ const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
+ const auto a_size = n;
+ auto a_buffer = Buffer<float2>(context, a_size);
+ const auto b_size = n;
+ auto b_buffer = Buffer<float2>(context, b_size);
+ const auto c_size = n;
+ auto c_buffer = Buffer<float2>(context, c_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<const float2*>(b));
+ c_buffer.Write(queue, c_size, reinterpret_cast<float2*>(c));
+ auto queue_cl = queue();
+ auto s = Gemm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Transpose>(b_transpose),
+ m, n, k,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ beta_cpp,
+ c_buffer(), 0, c_ld,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ c_buffer.Read(queue, c_size, reinterpret_cast<float2*>(c));
+}
+void cblas_zgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
+ const int m, const int n, const int k,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* b, const int b_ld,
+ const void* beta,
+ void* c, const int c_ld) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+ const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
+ const auto a_size = n;
+ auto a_buffer = Buffer<double2>(context, a_size);
+ const auto b_size = n;
+ auto b_buffer = Buffer<double2>(context, b_size);
+ const auto c_size = n;
+ auto c_buffer = Buffer<double2>(context, c_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<const double2*>(b));
+ c_buffer.Write(queue, c_size, reinterpret_cast<double2*>(c));
+ auto queue_cl = queue();
+ auto s = Gemm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Transpose>(b_transpose),
+ m, n, k,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ beta_cpp,
+ c_buffer(), 0, c_ld,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ c_buffer.Read(queue, c_size, reinterpret_cast<double2*>(c));
+}
+
+// SYMM
+void cblas_ssymm(const Layout layout, const Side side, const Triangle triangle,
+ const int m, const int n,
+ const float alpha,
+ const float* a, const int a_ld,
+ const float* b, const int b_ld,
+ const float beta,
+ float* c, const int c_ld) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto beta_cpp = beta;
+ const auto a_size = n;
+ auto a_buffer = Buffer<float>(context, a_size);
+ const auto b_size = n;
+ auto b_buffer = Buffer<float>(context, b_size);
+ const auto c_size = n;
+ auto c_buffer = Buffer<float>(context, c_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<const float*>(b));
+ c_buffer.Write(queue, c_size, reinterpret_cast<float*>(c));
+ auto queue_cl = queue();
+ auto s = Symm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Side>(side),
+ static_cast<clblast::Triangle>(triangle),
+ m, n,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ beta_cpp,
+ c_buffer(), 0, c_ld,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ c_buffer.Read(queue, c_size, reinterpret_cast<float*>(c));
+}
+void cblas_dsymm(const Layout layout, const Side side, const Triangle triangle,
+ const int m, const int n,
+ const double alpha,
+ const double* a, const int a_ld,
+ const double* b, const int b_ld,
+ const double beta,
+ double* c, const int c_ld) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto beta_cpp = beta;
+ const auto a_size = n;
+ auto a_buffer = Buffer<double>(context, a_size);
+ const auto b_size = n;
+ auto b_buffer = Buffer<double>(context, b_size);
+ const auto c_size = n;
+ auto c_buffer = Buffer<double>(context, c_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<const double*>(b));
+ c_buffer.Write(queue, c_size, reinterpret_cast<double*>(c));
+ auto queue_cl = queue();
+ auto s = Symm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Side>(side),
+ static_cast<clblast::Triangle>(triangle),
+ m, n,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ beta_cpp,
+ c_buffer(), 0, c_ld,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ c_buffer.Read(queue, c_size, reinterpret_cast<double*>(c));
+}
+void cblas_csymm(const Layout layout, const Side side, const Triangle triangle,
+ const int m, const int n,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* b, const int b_ld,
+ const void* beta,
+ void* c, const int c_ld) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+ const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
+ const auto a_size = n;
+ auto a_buffer = Buffer<float2>(context, a_size);
+ const auto b_size = n;
+ auto b_buffer = Buffer<float2>(context, b_size);
+ const auto c_size = n;
+ auto c_buffer = Buffer<float2>(context, c_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<const float2*>(b));
+ c_buffer.Write(queue, c_size, reinterpret_cast<float2*>(c));
+ auto queue_cl = queue();
+ auto s = Symm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Side>(side),
+ static_cast<clblast::Triangle>(triangle),
+ m, n,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ beta_cpp,
+ c_buffer(), 0, c_ld,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ c_buffer.Read(queue, c_size, reinterpret_cast<float2*>(c));
+}
+void cblas_zsymm(const Layout layout, const Side side, const Triangle triangle,
+ const int m, const int n,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* b, const int b_ld,
+ const void* beta,
+ void* c, const int c_ld) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+ const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
+ const auto a_size = n;
+ auto a_buffer = Buffer<double2>(context, a_size);
+ const auto b_size = n;
+ auto b_buffer = Buffer<double2>(context, b_size);
+ const auto c_size = n;
+ auto c_buffer = Buffer<double2>(context, c_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<const double2*>(b));
+ c_buffer.Write(queue, c_size, reinterpret_cast<double2*>(c));
+ auto queue_cl = queue();
+ auto s = Symm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Side>(side),
+ static_cast<clblast::Triangle>(triangle),
+ m, n,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ beta_cpp,
+ c_buffer(), 0, c_ld,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ c_buffer.Read(queue, c_size, reinterpret_cast<double2*>(c));
+}
+
+// HEMM
+void cblas_chemm(const Layout layout, const Side side, const Triangle triangle,
+ const int m, const int n,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* b, const int b_ld,
+ const void* beta,
+ void* c, const int c_ld) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+ const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
+ const auto a_size = n;
+ auto a_buffer = Buffer<float2>(context, a_size);
+ const auto b_size = n;
+ auto b_buffer = Buffer<float2>(context, b_size);
+ const auto c_size = n;
+ auto c_buffer = Buffer<float2>(context, c_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<const float2*>(b));
+ c_buffer.Write(queue, c_size, reinterpret_cast<float2*>(c));
+ auto queue_cl = queue();
+ auto s = Hemm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Side>(side),
+ static_cast<clblast::Triangle>(triangle),
+ m, n,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ beta_cpp,
+ c_buffer(), 0, c_ld,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ c_buffer.Read(queue, c_size, reinterpret_cast<float2*>(c));
+}
+void cblas_zhemm(const Layout layout, const Side side, const Triangle triangle,
+ const int m, const int n,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* b, const int b_ld,
+ const void* beta,
+ void* c, const int c_ld) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+ const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
+ const auto a_size = n;
+ auto a_buffer = Buffer<double2>(context, a_size);
+ const auto b_size = n;
+ auto b_buffer = Buffer<double2>(context, b_size);
+ const auto c_size = n;
+ auto c_buffer = Buffer<double2>(context, c_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<const double2*>(b));
+ c_buffer.Write(queue, c_size, reinterpret_cast<double2*>(c));
+ auto queue_cl = queue();
+ auto s = Hemm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Side>(side),
+ static_cast<clblast::Triangle>(triangle),
+ m, n,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ beta_cpp,
+ c_buffer(), 0, c_ld,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ c_buffer.Read(queue, c_size, reinterpret_cast<double2*>(c));
+}
+
+// SYRK
+void cblas_ssyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+ const int n, const int k,
+ const float alpha,
+ const float* a, const int a_ld,
+ const float beta,
+ float* c, const int c_ld) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto beta_cpp = beta;
+ const auto a_size = n;
+ auto a_buffer = Buffer<float>(context, a_size);
+ const auto c_size = n;
+ auto c_buffer = Buffer<float>(context, c_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
+ c_buffer.Write(queue, c_size, reinterpret_cast<float*>(c));
+ auto queue_cl = queue();
+ auto s = Syrk(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ n, k,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ beta_cpp,
+ c_buffer(), 0, c_ld,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ c_buffer.Read(queue, c_size, reinterpret_cast<float*>(c));
+}
+void cblas_dsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+ const int n, const int k,
+ const double alpha,
+ const double* a, const int a_ld,
+ const double beta,
+ double* c, const int c_ld) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto beta_cpp = beta;
+ const auto a_size = n;
+ auto a_buffer = Buffer<double>(context, a_size);
+ const auto c_size = n;
+ auto c_buffer = Buffer<double>(context, c_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
+ c_buffer.Write(queue, c_size, reinterpret_cast<double*>(c));
+ auto queue_cl = queue();
+ auto s = Syrk(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ n, k,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ beta_cpp,
+ c_buffer(), 0, c_ld,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ c_buffer.Read(queue, c_size, reinterpret_cast<double*>(c));
+}
+void cblas_csyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+ const int n, const int k,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* beta,
+ void* c, const int c_ld) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+ const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
+ const auto a_size = n;
+ auto a_buffer = Buffer<float2>(context, a_size);
+ const auto c_size = n;
+ auto c_buffer = Buffer<float2>(context, c_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+ c_buffer.Write(queue, c_size, reinterpret_cast<float2*>(c));
+ auto queue_cl = queue();
+ auto s = Syrk(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ n, k,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ beta_cpp,
+ c_buffer(), 0, c_ld,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ c_buffer.Read(queue, c_size, reinterpret_cast<float2*>(c));
+}
+void cblas_zsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+ const int n, const int k,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* beta,
+ void* c, const int c_ld) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+ const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
+ const auto a_size = n;
+ auto a_buffer = Buffer<double2>(context, a_size);
+ const auto c_size = n;
+ auto c_buffer = Buffer<double2>(context, c_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+ c_buffer.Write(queue, c_size, reinterpret_cast<double2*>(c));
+ auto queue_cl = queue();
+ auto s = Syrk(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ n, k,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ beta_cpp,
+ c_buffer(), 0, c_ld,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ c_buffer.Read(queue, c_size, reinterpret_cast<double2*>(c));
+}
+
+// HERK
+void cblas_cherk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+ const int n, const int k,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* beta,
+ void* c, const int c_ld) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto beta_cpp = beta;
+ const auto a_size = n;
+ auto a_buffer = Buffer<float2>(context, a_size);
+ const auto c_size = n;
+ auto c_buffer = Buffer<float2>(context, c_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+ c_buffer.Write(queue, c_size, reinterpret_cast<float2*>(c));
+ auto queue_cl = queue();
+ auto s = Herk(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ n, k,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ beta_cpp,
+ c_buffer(), 0, c_ld,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ c_buffer.Read(queue, c_size, reinterpret_cast<float2*>(c));
+}
+void cblas_zherk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+ const int n, const int k,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* beta,
+ void* c, const int c_ld) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto beta_cpp = beta;
+ const auto a_size = n;
+ auto a_buffer = Buffer<double2>(context, a_size);
+ const auto c_size = n;
+ auto c_buffer = Buffer<double2>(context, c_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+ c_buffer.Write(queue, c_size, reinterpret_cast<double2*>(c));
+ auto queue_cl = queue();
+ auto s = Herk(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ n, k,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ beta_cpp,
+ c_buffer(), 0, c_ld,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ c_buffer.Read(queue, c_size, reinterpret_cast<double2*>(c));
+}
+
+// SYR2K
+void cblas_ssyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+ const int n, const int k,
+ const float alpha,
+ const float* a, const int a_ld,
+ const float* b, const int b_ld,
+ const float beta,
+ float* c, const int c_ld) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto beta_cpp = beta;
+ const auto a_size = n;
+ auto a_buffer = Buffer<float>(context, a_size);
+ const auto b_size = n;
+ auto b_buffer = Buffer<float>(context, b_size);
+ const auto c_size = n;
+ auto c_buffer = Buffer<float>(context, c_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<const float*>(b));
+ c_buffer.Write(queue, c_size, reinterpret_cast<float*>(c));
+ auto queue_cl = queue();
+ auto s = Syr2k(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(ab_transpose),
+ n, k,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ beta_cpp,
+ c_buffer(), 0, c_ld,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ c_buffer.Read(queue, c_size, reinterpret_cast<float*>(c));
+}
+void cblas_dsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+ const int n, const int k,
+ const double alpha,
+ const double* a, const int a_ld,
+ const double* b, const int b_ld,
+ const double beta,
+ double* c, const int c_ld) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto beta_cpp = beta;
+ const auto a_size = n;
+ auto a_buffer = Buffer<double>(context, a_size);
+ const auto b_size = n;
+ auto b_buffer = Buffer<double>(context, b_size);
+ const auto c_size = n;
+ auto c_buffer = Buffer<double>(context, c_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<const double*>(b));
+ c_buffer.Write(queue, c_size, reinterpret_cast<double*>(c));
+ auto queue_cl = queue();
+ auto s = Syr2k(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(ab_transpose),
+ n, k,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ beta_cpp,
+ c_buffer(), 0, c_ld,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ c_buffer.Read(queue, c_size, reinterpret_cast<double*>(c));
+}
+void cblas_csyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+ const int n, const int k,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* b, const int b_ld,
+ const void* beta,
+ void* c, const int c_ld) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+ const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
+ const auto a_size = n;
+ auto a_buffer = Buffer<float2>(context, a_size);
+ const auto b_size = n;
+ auto b_buffer = Buffer<float2>(context, b_size);
+ const auto c_size = n;
+ auto c_buffer = Buffer<float2>(context, c_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<const float2*>(b));
+ c_buffer.Write(queue, c_size, reinterpret_cast<float2*>(c));
+ auto queue_cl = queue();
+ auto s = Syr2k(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(ab_transpose),
+ n, k,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ beta_cpp,
+ c_buffer(), 0, c_ld,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ c_buffer.Read(queue, c_size, reinterpret_cast<float2*>(c));
+}
+void cblas_zsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+ const int n, const int k,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* b, const int b_ld,
+ const void* beta,
+ void* c, const int c_ld) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+ const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
+ const auto a_size = n;
+ auto a_buffer = Buffer<double2>(context, a_size);
+ const auto b_size = n;
+ auto b_buffer = Buffer<double2>(context, b_size);
+ const auto c_size = n;
+ auto c_buffer = Buffer<double2>(context, c_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<const double2*>(b));
+ c_buffer.Write(queue, c_size, reinterpret_cast<double2*>(c));
+ auto queue_cl = queue();
+ auto s = Syr2k(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(ab_transpose),
+ n, k,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ beta_cpp,
+ c_buffer(), 0, c_ld,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ c_buffer.Read(queue, c_size, reinterpret_cast<double2*>(c));
+}
+
+// HER2K
+void cblas_cher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+ const int n, const int k,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* b, const int b_ld,
+ const void* beta,
+ void* c, const int c_ld) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+ const auto beta_cpp = beta;
+ const auto a_size = n;
+ auto a_buffer = Buffer<float2>(context, a_size);
+ const auto b_size = n;
+ auto b_buffer = Buffer<float2>(context, b_size);
+ const auto c_size = n;
+ auto c_buffer = Buffer<float2>(context, c_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<const float2*>(b));
+ c_buffer.Write(queue, c_size, reinterpret_cast<float2*>(c));
+ auto queue_cl = queue();
+ auto s = Her2k(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(ab_transpose),
+ n, k,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ beta_cpp,
+ c_buffer(), 0, c_ld,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ c_buffer.Read(queue, c_size, reinterpret_cast<float2*>(c));
+}
+void cblas_zher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+ const int n, const int k,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* b, const int b_ld,
+ const void* beta,
+ void* c, const int c_ld) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+ const auto beta_cpp = beta;
+ const auto a_size = n;
+ auto a_buffer = Buffer<double2>(context, a_size);
+ const auto b_size = n;
+ auto b_buffer = Buffer<double2>(context, b_size);
+ const auto c_size = n;
+ auto c_buffer = Buffer<double2>(context, c_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<const double2*>(b));
+ c_buffer.Write(queue, c_size, reinterpret_cast<double2*>(c));
+ auto queue_cl = queue();
+ auto s = Her2k(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(ab_transpose),
+ n, k,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ beta_cpp,
+ c_buffer(), 0, c_ld,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ c_buffer.Read(queue, c_size, reinterpret_cast<double2*>(c));
+}
+
+// TRMM
+void cblas_strmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+ const int m, const int n,
+ const float alpha,
+ const float* a, const int a_ld,
+ float* b, const int b_ld) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto a_size = n;
+ auto a_buffer = Buffer<float>(context, a_size);
+ const auto b_size = n;
+ auto b_buffer = Buffer<float>(context, b_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<float*>(b));
+ auto queue_cl = queue();
+ auto s = Trmm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Side>(side),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ m, n,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ b_buffer.Read(queue, b_size, reinterpret_cast<float*>(b));
+}
+void cblas_dtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+ const int m, const int n,
+ const double alpha,
+ const double* a, const int a_ld,
+ double* b, const int b_ld) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto a_size = n;
+ auto a_buffer = Buffer<double>(context, a_size);
+ const auto b_size = n;
+ auto b_buffer = Buffer<double>(context, b_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<double*>(b));
+ auto queue_cl = queue();
+ auto s = Trmm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Side>(side),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ m, n,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ b_buffer.Read(queue, b_size, reinterpret_cast<double*>(b));
+}
+void cblas_ctrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+ const int m, const int n,
+ const void* alpha,
+ const void* a, const int a_ld,
+ void* b, const int b_ld) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+ const auto a_size = n;
+ auto a_buffer = Buffer<float2>(context, a_size);
+ const auto b_size = n;
+ auto b_buffer = Buffer<float2>(context, b_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<float2*>(b));
+ auto queue_cl = queue();
+ auto s = Trmm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Side>(side),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ m, n,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ b_buffer.Read(queue, b_size, reinterpret_cast<float2*>(b));
+}
+void cblas_ztrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+ const int m, const int n,
+ const void* alpha,
+ const void* a, const int a_ld,
+ void* b, const int b_ld) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+ const auto a_size = n;
+ auto a_buffer = Buffer<double2>(context, a_size);
+ const auto b_size = n;
+ auto b_buffer = Buffer<double2>(context, b_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<double2*>(b));
+ auto queue_cl = queue();
+ auto s = Trmm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Side>(side),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ m, n,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ b_buffer.Read(queue, b_size, reinterpret_cast<double2*>(b));
+}
+
+// TRSM
+void cblas_strsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+ const int m, const int n,
+ const float alpha,
+ const float* a, const int a_ld,
+ float* b, const int b_ld) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto a_size = n;
+ auto a_buffer = Buffer<float>(context, a_size);
+ const auto b_size = n;
+ auto b_buffer = Buffer<float>(context, b_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<float*>(b));
+ auto queue_cl = queue();
+ auto s = Trsm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Side>(side),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ m, n,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ b_buffer.Read(queue, b_size, reinterpret_cast<float*>(b));
+}
+void cblas_dtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+ const int m, const int n,
+ const double alpha,
+ const double* a, const int a_ld,
+ double* b, const int b_ld) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto a_size = n;
+ auto a_buffer = Buffer<double>(context, a_size);
+ const auto b_size = n;
+ auto b_buffer = Buffer<double>(context, b_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<double*>(b));
+ auto queue_cl = queue();
+ auto s = Trsm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Side>(side),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ m, n,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ b_buffer.Read(queue, b_size, reinterpret_cast<double*>(b));
+}
+void cblas_ctrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+ const int m, const int n,
+ const void* alpha,
+ const void* a, const int a_ld,
+ void* b, const int b_ld) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+ const auto a_size = n;
+ auto a_buffer = Buffer<float2>(context, a_size);
+ const auto b_size = n;
+ auto b_buffer = Buffer<float2>(context, b_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<float2*>(b));
+ auto queue_cl = queue();
+ auto s = Trsm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Side>(side),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ m, n,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ b_buffer.Read(queue, b_size, reinterpret_cast<float2*>(b));
+}
+void cblas_ztrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+ const int m, const int n,
+ const void* alpha,
+ const void* a, const int a_ld,
+ void* b, const int b_ld) {
+ auto device = get_device();
+ auto context = Context(device);
+ auto queue = Queue(context, device);
+ const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+ const auto a_size = n;
+ auto a_buffer = Buffer<double2>(context, a_size);
+ const auto b_size = n;
+ auto b_buffer = Buffer<double2>(context, b_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<double2*>(b));
+ auto queue_cl = queue();
+ auto s = Trsm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Side>(side),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ m, n,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ &queue_cl);
+ if (s != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+ }
+ b_buffer.Read(queue, b_size, reinterpret_cast<double2*>(b));
+}
+
+// =================================================================================================
+// Extra non-BLAS routines (level-X)
+// =================================================================================================
+
+// OMATCOPY
+// OMATCOPY: B = alpha * op(A) -- scaled out-of-place matrix copy/transpose.
+// Single-precision Netlib-style wrapper: uploads host data to device buffers,
+// runs CLBlast's OpenCL Omatcopy routine, and reads the result back into 'b'.
+void cblas_somatcopy(const Layout layout, const Transpose a_transpose,
+                     const int m, const int n,
+                     const float alpha,
+                     const float* a, const int a_ld,
+                     float* b, const int b_ld) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = alpha;
+  // A is m-by-n: size is the slower dimension times its leading dimension.
+  // The previous 'n' undersized the buffer for any non-trivial matrix,
+  // corrupting device memory.
+  const auto a_size = (layout == kRowMajor) ? m * a_ld : n * a_ld;
+  auto a_buffer = Buffer<float>(context, a_size);
+  // B has the shape of op(A): m-by-n normally, n-by-m when transposing
+  const auto b_rows = (a_transpose != kNo) ? n : m;
+  const auto b_cols = (a_transpose != kNo) ? m : n;
+  const auto b_size = (layout == kRowMajor) ? b_rows * b_ld : b_cols * b_ld;
+  auto b_buffer = Buffer<float>(context, b_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
+  b_buffer.Write(queue, b_size, reinterpret_cast<const float*>(b));
+  auto queue_cl = queue();
+  auto s = Omatcopy(static_cast<clblast::Layout>(layout),
+                    static_cast<clblast::Transpose>(a_transpose),
+                    m, n,
+                    alpha_cpp,
+                    a_buffer(), 0, a_ld,
+                    b_buffer(), 0, b_ld,
+                    &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  b_buffer.Read(queue, b_size, reinterpret_cast<float*>(b));
+}
+// OMATCOPY: B = alpha * op(A) -- scaled out-of-place matrix copy/transpose.
+// Double-precision Netlib-style wrapper: uploads host data to device buffers,
+// runs CLBlast's OpenCL Omatcopy routine, and reads the result back into 'b'.
+void cblas_domatcopy(const Layout layout, const Transpose a_transpose,
+                     const int m, const int n,
+                     const double alpha,
+                     const double* a, const int a_ld,
+                     double* b, const int b_ld) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto alpha_cpp = alpha;
+  // A is m-by-n: size is the slower dimension times its leading dimension.
+  // The previous 'n' undersized the buffer for any non-trivial matrix,
+  // corrupting device memory.
+  const auto a_size = (layout == kRowMajor) ? m * a_ld : n * a_ld;
+  auto a_buffer = Buffer<double>(context, a_size);
+  // B has the shape of op(A): m-by-n normally, n-by-m when transposing
+  const auto b_rows = (a_transpose != kNo) ? n : m;
+  const auto b_cols = (a_transpose != kNo) ? m : n;
+  const auto b_size = (layout == kRowMajor) ? b_rows * b_ld : b_cols * b_ld;
+  auto b_buffer = Buffer<double>(context, b_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
+  b_buffer.Write(queue, b_size, reinterpret_cast<const double*>(b));
+  auto queue_cl = queue();
+  auto s = Omatcopy(static_cast<clblast::Layout>(layout),
+                    static_cast<clblast::Transpose>(a_transpose),
+                    m, n,
+                    alpha_cpp,
+                    a_buffer(), 0, a_ld,
+                    b_buffer(), 0, b_ld,
+                    &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  b_buffer.Read(queue, b_size, reinterpret_cast<double*>(b));
+}
+// OMATCOPY: B = alpha * op(A) -- scaled out-of-place matrix copy/transpose.
+// Single-complex Netlib-style wrapper: uploads host data to device buffers,
+// runs CLBlast's OpenCL Omatcopy routine, and reads the result back into 'b'.
+void cblas_comatcopy(const Layout layout, const Transpose a_transpose,
+                     const int m, const int n,
+                     const void* alpha,
+                     const void* a, const int a_ld,
+                     void* b, const int b_ld) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  // 'alpha' points to a (real, imag) pair of floats
+  const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+  // A is m-by-n: size is the slower dimension times its leading dimension.
+  // The previous 'n' undersized the buffer for any non-trivial matrix,
+  // corrupting device memory.
+  const auto a_size = (layout == kRowMajor) ? m * a_ld : n * a_ld;
+  auto a_buffer = Buffer<float2>(context, a_size);
+  // B has the shape of op(A): m-by-n normally, n-by-m when transposing
+  const auto b_rows = (a_transpose != kNo) ? n : m;
+  const auto b_cols = (a_transpose != kNo) ? m : n;
+  const auto b_size = (layout == kRowMajor) ? b_rows * b_ld : b_cols * b_ld;
+  auto b_buffer = Buffer<float2>(context, b_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+  b_buffer.Write(queue, b_size, reinterpret_cast<const float2*>(b));
+  auto queue_cl = queue();
+  auto s = Omatcopy(static_cast<clblast::Layout>(layout),
+                    static_cast<clblast::Transpose>(a_transpose),
+                    m, n,
+                    alpha_cpp,
+                    a_buffer(), 0, a_ld,
+                    b_buffer(), 0, b_ld,
+                    &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  b_buffer.Read(queue, b_size, reinterpret_cast<float2*>(b));
+}
+// OMATCOPY: B = alpha * op(A) -- scaled out-of-place matrix copy/transpose.
+// Double-complex Netlib-style wrapper: uploads host data to device buffers,
+// runs CLBlast's OpenCL Omatcopy routine, and reads the result back into 'b'.
+void cblas_zomatcopy(const Layout layout, const Transpose a_transpose,
+                     const int m, const int n,
+                     const void* alpha,
+                     const void* a, const int a_ld,
+                     void* b, const int b_ld) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  // 'alpha' points to a (real, imag) pair of doubles
+  const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+  // A is m-by-n: size is the slower dimension times its leading dimension.
+  // The previous 'n' undersized the buffer for any non-trivial matrix,
+  // corrupting device memory.
+  const auto a_size = (layout == kRowMajor) ? m * a_ld : n * a_ld;
+  auto a_buffer = Buffer<double2>(context, a_size);
+  // B has the shape of op(A): m-by-n normally, n-by-m when transposing
+  const auto b_rows = (a_transpose != kNo) ? n : m;
+  const auto b_cols = (a_transpose != kNo) ? m : n;
+  const auto b_size = (layout == kRowMajor) ? b_rows * b_ld : b_cols * b_ld;
+  auto b_buffer = Buffer<double2>(context, b_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+  b_buffer.Write(queue, b_size, reinterpret_cast<const double2*>(b));
+  auto queue_cl = queue();
+  auto s = Omatcopy(static_cast<clblast::Layout>(layout),
+                    static_cast<clblast::Transpose>(a_transpose),
+                    m, n,
+                    alpha_cpp,
+                    a_buffer(), 0, a_ld,
+                    b_buffer(), 0, b_ld,
+                    &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  b_buffer.Read(queue, b_size, reinterpret_cast<double2*>(b));
+}
+
+// =================================================================================================
+} // namespace clblast