summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCedric Nugteren <web@cedricnugteren.nl>2016-11-24 19:35:59 +0100
committerGitHub <noreply@github.com>2016-11-24 19:35:59 +0100
commitcb398f0e42fa8e8cae201ffe8b53b60c5ba62d29 (patch)
treec4b63cf8c39e673765f142bbed2fff655500dfee
parent88ba1f4db9c0c0f9059e441d6d5988e3849b5849 (diff)
parent2ff3f77392dc7395abf03d3864c42ff894918889 (diff)
Merge pull request #125 from CNugteren/netlib_blas_api
Netlib CBLAS API for CLBlast
-rw-r--r--CHANGELOG1
-rw-r--r--CMakeLists.txt10
-rw-r--r--README.md4
-rw-r--r--include/clblast_c.h5
-rw-r--r--include/clblast_netlib_c.h920
-rw-r--r--samples/sgemm_netlib.c69
-rwxr-xr-xscripts/generator/generator.py151
-rw-r--r--scripts/generator/generator/cpp.py73
-rw-r--r--scripts/generator/generator/datatype.py20
-rw-r--r--scripts/generator/generator/routine.py137
-rw-r--r--src/clblast_netlib_c.cpp4648
-rw-r--r--src/routines/level1/xscal.cpp4
-rw-r--r--src/utilities/utilities.cpp4
13 files changed, 5984 insertions, 62 deletions
diff --git a/CHANGELOG b/CHANGELOG
index b679a435..0e25d1f5 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -2,6 +2,7 @@
Development version (next release)
- Updated to version 8.0 of the CLCudaAPI C++11 OpenCL header
- Changed the enums in the C API to avoid potential name clashes with external code
+- Added a Netlib CBLAS compatible API (not recommended for full control over performance)
- Greatly improved the way exceptions are handled in the library (thanks to 'intelfx')
- Improved performance of GEMM kernels for small sizes by using a direct single-kernel implementation
- Fixed a bug in the tests and samples related to waiting for an invalid event
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f5edbd75..246d006c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -27,6 +27,7 @@ option(SAMPLES "Enable compilation of the examples" OFF)
option(TUNERS "Enable compilation of the tuners" OFF)
option(CLIENTS "Enable compilation of the clients to test and compare performance" OFF)
option(TESTS "Enable compilation of the correctness tests" OFF)
+option(NETLIB "Enable compilation of the CBLAS Netlib API" OFF)
# Compile in verbose mode with additional diagnostic messages
option(VERBOSE "Compile in verbose mode for additional diagnostic messages" OFF)
@@ -151,6 +152,9 @@ set(KERNELS copy_fast copy_pad transpose_fast transpose_pad xaxpy xdot xger
xgemm xgemm_direct xgemv)
set(SAMPLE_PROGRAMS_CPP sgemm)
set(SAMPLE_PROGRAMS_C sasum dgemv sgemm haxpy cache)
+if(NETLIB)
+ set(SAMPLE_PROGRAMS_C ${SAMPLE_PROGRAMS_C} sgemm_netlib)
+endif()
set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax)
set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv
xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2)
@@ -172,6 +176,9 @@ set(SOURCES
src/clblast_c.cpp
src/routine.cpp
)
+if(NETLIB)
+ set(SOURCES ${SOURCES} src/clblast_netlib_c.cpp)
+endif()
foreach(ROUTINE ${LEVEL1_ROUTINES})
set(SOURCES ${SOURCES} src/routines/level1/${ROUTINE}.cpp)
endforeach()
@@ -213,6 +220,9 @@ install(TARGETS clblast EXPORT CLBlast DESTINATION lib)
install(FILES include/clblast.h DESTINATION include)
install(FILES include/clblast_c.h DESTINATION include)
install(FILES include/clblast_half.h DESTINATION include)
+if(NETLIB)
+ install(FILES include/clblast_netlib_c.h DESTINATION include)
+endif()
# Installs the config for find_package in dependent projects
install(EXPORT CLBlast DESTINATION lib/cmake/CLBLast FILE CLBlastConfig.cmake)
diff --git a/README.md b/README.md
index 9b289448..15b65c06 100644
--- a/README.md
+++ b/README.md
@@ -96,6 +96,10 @@ Afterwards, any of CLBlast's routines can be called directly: there is no need t
Furthermore, it is possible to optionally set an OS environmental variable `CLBLAST_BUILD_OPTIONS` to pass specific build options to the OpenCL compiler.
+There is also a Netlib CBLAS C API available. This is however not recommended for full control over performance, since at every call it will copy all buffers to and from the OpenCL device. Especially for level 1 and level 2 BLAS functions performance will be impacted severely. However, it can be useful if you don't want to touch OpenCL at all. You can set the default device and platform by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environment variables. This API can be used as follows after providing the `-DNETLIB=ON` flag to CMake:
+
+ #include <clblast_netlib_c.h>
+
Using the tuners (optional)
-------------
diff --git a/include/clblast_c.h b/include/clblast_c.h
index 81f093cd..72f50d83 100644
--- a/include/clblast_c.h
+++ b/include/clblast_c.h
@@ -117,11 +117,6 @@ typedef enum CLBlastDiagonal_ { CLBlastDiagonalNonUnit = 131,
CLBlastDiagonalUnit = 132 } CLBlastDiagonal;
typedef enum CLBlastSide_ { CLBlastSideLeft = 141, CLBlastSideRight = 142 } CLBlastSide;
-// Precision scoped enum (values in bits)
-typedef enum CLBlastPrecision_ { CLBlastPrecisionHalf = 16, CLBlastPrecisionSingle = 32,
- CLBlastPrecisionDouble = 64, CLBlastPrecisionComplexSingle = 3232,
- CLBlastPrecisionComplexDouble = 6464 } CLBlastPrecision;
-
// =================================================================================================
// BLAS level-1 (vector-vector) routines
// =================================================================================================
diff --git a/include/clblast_netlib_c.h b/include/clblast_netlib_c.h
new file mode 100644
index 00000000..b5577cfa
--- /dev/null
+++ b/include/clblast_netlib_c.h
@@ -0,0 +1,920 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the Netlib CBLAS interface to the CLBlast BLAS routines, performing all buffer
+// copies automatically and running on the default OpenCL platform and device. For full control over
+// performance, it is advised to use the regular clblast.h or clblast_c.h headers instead.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_CLBLAST_NETLIB_C_H_
+#define CLBLAST_CLBLAST_NETLIB_C_H_
+
+// Exports library functions under Windows when building a DLL. See also:
+// https://msdn.microsoft.com/en-us/library/a90k134d.aspx
+#if defined(_WIN32) && defined(CLBLAST_DLL)
+ #if defined(COMPILING_DLL)
+ #define PUBLIC_API __declspec(dllexport)
+ #else
+ #define PUBLIC_API __declspec(dllimport)
+ #endif
+#else
+ #define PUBLIC_API
+#endif
+
+// The C interface
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// =================================================================================================
+
+// Matrix layout and transpose types
+typedef enum CLBlastLayout_ { CLBlastLayoutRowMajor = 101,
+ CLBlastLayoutColMajor = 102 } CLBlastLayout;
+typedef enum CLBlastTranspose_ { CLBlastTransposeNo = 111, CLBlastTransposeYes = 112,
+ CLBlastTransposeConjugate = 113 } CLBlastTranspose;
+typedef enum CLBlastTriangle_ { CLBlastTriangleUpper = 121,
+ CLBlastTriangleLower = 122 } CLBlastTriangle;
+typedef enum CLBlastDiagonal_ { CLBlastDiagonalNonUnit = 131,
+ CLBlastDiagonalUnit = 132 } CLBlastDiagonal;
+typedef enum CLBlastSide_ { CLBlastSideLeft = 141, CLBlastSideRight = 142 } CLBlastSide;
+
+// For full compatibility with CBLAS
+typedef CLBlastLayout CBLAS_ORDER;
+typedef CLBlastTranspose CBLAS_TRANSPOSE;
+typedef CLBlastTriangle CBLAS_UPLO;
+typedef CLBlastDiagonal CBLAS_DIAG;
+typedef CLBlastSide CBLAS_SIDE;
+#define CblasRowMajor CLBlastLayoutRowMajor
+#define CblasColMajor CLBlastLayoutColMajor
+#define CblasNoTrans CLBlastTransposeNo
+#define CblasTrans CLBlastTransposeYes
+#define CblasConjTrans CLBlastTransposeConjugate
+#define CblasUpper CLBlastTriangleUpper
+#define CblasLower CLBlastTriangleLower
+#define CblasNonUnit CLBlastDiagonalNonUnit
+#define CblasUnit CLBlastDiagonalUnit
+#define CblasLeft CLBlastSideLeft
+#define CblasRight CLBlastSideRight
+
+// =================================================================================================
+// BLAS level-1 (vector-vector) routines
+// =================================================================================================
+
+// Generate givens plane rotation: SROTG/DROTG
+void PUBLIC_API cblas_srotg(float* sa,
+ float* sb,
+ float* sc,
+ float* ss);
+void PUBLIC_API cblas_drotg(double* sa,
+ double* sb,
+ double* sc,
+ double* ss);
+
+// Generate modified givens plane rotation: SROTMG/DROTMG
+void PUBLIC_API cblas_srotmg(float* sd1,
+ float* sd2,
+ float* sx1,
+ const float sy1,
+ float* sparam);
+void PUBLIC_API cblas_drotmg(double* sd1,
+ double* sd2,
+ double* sx1,
+ const double sy1,
+ double* sparam);
+
+// Apply givens plane rotation: SROT/DROT
+void PUBLIC_API cblas_srot(const int n,
+ float* x, const int x_inc,
+ float* y, const int y_inc,
+ const float cos,
+ const float sin);
+void PUBLIC_API cblas_drot(const int n,
+ double* x, const int x_inc,
+ double* y, const int y_inc,
+ const double cos,
+ const double sin);
+
+// Apply modified givens plane rotation: SROTM/DROTM
+void PUBLIC_API cblas_srotm(const int n,
+ float* x, const int x_inc,
+ float* y, const int y_inc,
+ float* sparam);
+void PUBLIC_API cblas_drotm(const int n,
+ double* x, const int x_inc,
+ double* y, const int y_inc,
+ double* sparam);
+
+// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP
+void PUBLIC_API cblas_sswap(const int n,
+ float* x, const int x_inc,
+ float* y, const int y_inc);
+void PUBLIC_API cblas_dswap(const int n,
+ double* x, const int x_inc,
+ double* y, const int y_inc);
+void PUBLIC_API cblas_cswap(const int n,
+ void* x, const int x_inc,
+ void* y, const int y_inc);
+void PUBLIC_API cblas_zswap(const int n,
+ void* x, const int x_inc,
+ void* y, const int y_inc);
+
+// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL
+void PUBLIC_API cblas_sscal(const int n,
+ const float alpha,
+ float* x, const int x_inc);
+void PUBLIC_API cblas_dscal(const int n,
+ const double alpha,
+ double* x, const int x_inc);
+void PUBLIC_API cblas_cscal(const int n,
+ const void* alpha,
+ void* x, const int x_inc);
+void PUBLIC_API cblas_zscal(const int n,
+ const void* alpha,
+ void* x, const int x_inc);
+
+// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY
+void PUBLIC_API cblas_scopy(const int n,
+ const float* x, const int x_inc,
+ float* y, const int y_inc);
+void PUBLIC_API cblas_dcopy(const int n,
+ const double* x, const int x_inc,
+ double* y, const int y_inc);
+void PUBLIC_API cblas_ccopy(const int n,
+ const void* x, const int x_inc,
+ void* y, const int y_inc);
+void PUBLIC_API cblas_zcopy(const int n,
+ const void* x, const int x_inc,
+ void* y, const int y_inc);
+
+// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY
+void PUBLIC_API cblas_saxpy(const int n,
+ const float alpha,
+ const float* x, const int x_inc,
+ float* y, const int y_inc);
+void PUBLIC_API cblas_daxpy(const int n,
+ const double alpha,
+ const double* x, const int x_inc,
+ double* y, const int y_inc);
+void PUBLIC_API cblas_caxpy(const int n,
+ const void* alpha,
+ const void* x, const int x_inc,
+ void* y, const int y_inc);
+void PUBLIC_API cblas_zaxpy(const int n,
+ const void* alpha,
+ const void* x, const int x_inc,
+ void* y, const int y_inc);
+
+// Dot product of two vectors: SDOT/DDOT
+float PUBLIC_API cblas_sdot(const int n,
+ const float* x, const int x_inc,
+ const float* y, const int y_inc);
+double PUBLIC_API cblas_ddot(const int n,
+ const double* x, const int x_inc,
+ const double* y, const int y_inc);
+
+// Dot product of two complex vectors: CDOTU/ZDOTU
+void PUBLIC_API cblas_cdotu_sub(const int n,
+ const void* x, const int x_inc,
+ const void* y, const int y_inc,
+ void* dot);
+void PUBLIC_API cblas_zdotu_sub(const int n,
+ const void* x, const int x_inc,
+ const void* y, const int y_inc,
+ void* dot);
+
+// Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC
+void PUBLIC_API cblas_cdotc_sub(const int n,
+ const void* x, const int x_inc,
+ const void* y, const int y_inc,
+ void* dot);
+void PUBLIC_API cblas_zdotc_sub(const int n,
+ const void* x, const int x_inc,
+ const void* y, const int y_inc,
+ void* dot);
+
+// Euclidean norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2
+float PUBLIC_API cblas_snrm2(const int n,
+ const float* x, const int x_inc);
+double PUBLIC_API cblas_dnrm2(const int n,
+ const double* x, const int x_inc);
+float PUBLIC_API cblas_scnrm2(const int n,
+ const void* x, const int x_inc);
+double PUBLIC_API cblas_dznrm2(const int n,
+ const void* x, const int x_inc);
+
+// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM
+float PUBLIC_API cblas_sasum(const int n,
+ const float* x, const int x_inc);
+double PUBLIC_API cblas_dasum(const int n,
+ const double* x, const int x_inc);
+float PUBLIC_API cblas_scasum(const int n,
+ const void* x, const int x_inc);
+double PUBLIC_API cblas_dzasum(const int n,
+ const void* x, const int x_inc);
+
+// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM
+float PUBLIC_API cblas_ssum(const int n,
+ const float* x, const int x_inc);
+double PUBLIC_API cblas_dsum(const int n,
+ const double* x, const int x_inc);
+float PUBLIC_API cblas_scsum(const int n,
+ const void* x, const int x_inc);
+double PUBLIC_API cblas_dzsum(const int n,
+ const void* x, const int x_inc);
+
+// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX
+int PUBLIC_API cblas_isamax(const int n,
+ const float* x, const int x_inc);
+int PUBLIC_API cblas_idamax(const int n,
+ const double* x, const int x_inc);
+int PUBLIC_API cblas_icamax(const int n,
+ const void* x, const int x_inc);
+int PUBLIC_API cblas_izamax(const int n,
+ const void* x, const int x_inc);
+
+// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX
+int PUBLIC_API cblas_ismax(const int n,
+ const float* x, const int x_inc);
+int PUBLIC_API cblas_idmax(const int n,
+ const double* x, const int x_inc);
+int PUBLIC_API cblas_icmax(const int n,
+ const void* x, const int x_inc);
+int PUBLIC_API cblas_izmax(const int n,
+ const void* x, const int x_inc);
+
+// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN
+int PUBLIC_API cblas_ismin(const int n,
+ const float* x, const int x_inc);
+int PUBLIC_API cblas_idmin(const int n,
+ const double* x, const int x_inc);
+int PUBLIC_API cblas_icmin(const int n,
+ const void* x, const int x_inc);
+int PUBLIC_API cblas_izmin(const int n,
+ const void* x, const int x_inc);
+
+// =================================================================================================
+// BLAS level-2 (matrix-vector) routines
+// =================================================================================================
+
+// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV
+void PUBLIC_API cblas_sgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const int m, const int n,
+ const float alpha,
+ const float* a, const int a_ld,
+ const float* x, const int x_inc,
+ const float beta,
+ float* y, const int y_inc);
+void PUBLIC_API cblas_dgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const int m, const int n,
+ const double alpha,
+ const double* a, const int a_ld,
+ const double* x, const int x_inc,
+ const double beta,
+ double* y, const int y_inc);
+void PUBLIC_API cblas_cgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const int m, const int n,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* x, const int x_inc,
+ const void* beta,
+ void* y, const int y_inc);
+void PUBLIC_API cblas_zgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const int m, const int n,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* x, const int x_inc,
+ const void* beta,
+ void* y, const int y_inc);
+
+// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV
+void PUBLIC_API cblas_sgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const int m, const int n, const int kl, const int ku,
+ const float alpha,
+ const float* a, const int a_ld,
+ const float* x, const int x_inc,
+ const float beta,
+ float* y, const int y_inc);
+void PUBLIC_API cblas_dgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const int m, const int n, const int kl, const int ku,
+ const double alpha,
+ const double* a, const int a_ld,
+ const double* x, const int x_inc,
+ const double beta,
+ double* y, const int y_inc);
+void PUBLIC_API cblas_cgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const int m, const int n, const int kl, const int ku,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* x, const int x_inc,
+ const void* beta,
+ void* y, const int y_inc);
+void PUBLIC_API cblas_zgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const int m, const int n, const int kl, const int ku,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* x, const int x_inc,
+ const void* beta,
+ void* y, const int y_inc);
+
+// Hermitian matrix-vector multiplication: CHEMV/ZHEMV
+void PUBLIC_API cblas_chemv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* x, const int x_inc,
+ const void* beta,
+ void* y, const int y_inc);
+void PUBLIC_API cblas_zhemv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* x, const int x_inc,
+ const void* beta,
+ void* y, const int y_inc);
+
+// Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV
+void PUBLIC_API cblas_chbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n, const int k,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* x, const int x_inc,
+ const void* beta,
+ void* y, const int y_inc);
+void PUBLIC_API cblas_zhbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n, const int k,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* x, const int x_inc,
+ const void* beta,
+ void* y, const int y_inc);
+
+// Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV
+void PUBLIC_API cblas_chpmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const void* alpha,
+ const void* ap,
+ const void* x, const int x_inc,
+ const void* beta,
+ void* y, const int y_inc);
+void PUBLIC_API cblas_zhpmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const void* alpha,
+ const void* ap,
+ const void* x, const int x_inc,
+ const void* beta,
+ void* y, const int y_inc);
+
+// Symmetric matrix-vector multiplication: SSYMV/DSYMV
+void PUBLIC_API cblas_ssymv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const float alpha,
+ const float* a, const int a_ld,
+ const float* x, const int x_inc,
+ const float beta,
+ float* y, const int y_inc);
+void PUBLIC_API cblas_dsymv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const double alpha,
+ const double* a, const int a_ld,
+ const double* x, const int x_inc,
+ const double beta,
+ double* y, const int y_inc);
+
+// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV
+void PUBLIC_API cblas_ssbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n, const int k,
+ const float alpha,
+ const float* a, const int a_ld,
+ const float* x, const int x_inc,
+ const float beta,
+ float* y, const int y_inc);
+void PUBLIC_API cblas_dsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n, const int k,
+ const double alpha,
+ const double* a, const int a_ld,
+ const double* x, const int x_inc,
+ const double beta,
+ double* y, const int y_inc);
+
+// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV
+void PUBLIC_API cblas_sspmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const float alpha,
+ const float* ap,
+ const float* x, const int x_inc,
+ const float beta,
+ float* y, const int y_inc);
+void PUBLIC_API cblas_dspmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const double alpha,
+ const double* ap,
+ const double* x, const int x_inc,
+ const double beta,
+ double* y, const int y_inc);
+
+// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV
+void PUBLIC_API cblas_strmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n,
+ const float* a, const int a_ld,
+ float* x, const int x_inc);
+void PUBLIC_API cblas_dtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n,
+ const double* a, const int a_ld,
+ double* x, const int x_inc);
+void PUBLIC_API cblas_ctrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n,
+ const void* a, const int a_ld,
+ void* x, const int x_inc);
+void PUBLIC_API cblas_ztrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n,
+ const void* a, const int a_ld,
+ void* x, const int x_inc);
+
+// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV
+void PUBLIC_API cblas_stbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n, const int k,
+ const float* a, const int a_ld,
+ float* x, const int x_inc);
+void PUBLIC_API cblas_dtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n, const int k,
+ const double* a, const int a_ld,
+ double* x, const int x_inc);
+void PUBLIC_API cblas_ctbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n, const int k,
+ const void* a, const int a_ld,
+ void* x, const int x_inc);
+void PUBLIC_API cblas_ztbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n, const int k,
+ const void* a, const int a_ld,
+ void* x, const int x_inc);
+
+// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV
+void PUBLIC_API cblas_stpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n,
+ const float* ap,
+ float* x, const int x_inc);
+void PUBLIC_API cblas_dtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n,
+ const double* ap,
+ double* x, const int x_inc);
+void PUBLIC_API cblas_ctpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n,
+ const void* ap,
+ void* x, const int x_inc);
+void PUBLIC_API cblas_ztpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n,
+ const void* ap,
+ void* x, const int x_inc);
+
+// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV
+void PUBLIC_API cblas_strsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n,
+ const float* a, const int a_ld,
+ float* x, const int x_inc);
+void PUBLIC_API cblas_dtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n,
+ const double* a, const int a_ld,
+ double* x, const int x_inc);
+void PUBLIC_API cblas_ctrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n,
+ const void* a, const int a_ld,
+ void* x, const int x_inc);
+void PUBLIC_API cblas_ztrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n,
+ const void* a, const int a_ld,
+ void* x, const int x_inc);
+
+// Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV
+void PUBLIC_API cblas_stbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n, const int k,
+ const float* a, const int a_ld,
+ float* x, const int x_inc);
+void PUBLIC_API cblas_dtbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n, const int k,
+ const double* a, const int a_ld,
+ double* x, const int x_inc);
+void PUBLIC_API cblas_ctbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n, const int k,
+ const void* a, const int a_ld,
+ void* x, const int x_inc);
+void PUBLIC_API cblas_ztbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n, const int k,
+ const void* a, const int a_ld,
+ void* x, const int x_inc);
+
+// Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV
+void PUBLIC_API cblas_stpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n,
+ const float* ap,
+ float* x, const int x_inc);
+void PUBLIC_API cblas_dtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n,
+ const double* ap,
+ double* x, const int x_inc);
+void PUBLIC_API cblas_ctpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n,
+ const void* ap,
+ void* x, const int x_inc);
+void PUBLIC_API cblas_ztpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n,
+ const void* ap,
+ void* x, const int x_inc);
+
+// General rank-1 matrix update: SGER/DGER
+void PUBLIC_API cblas_sger(const CLBlastLayout layout,
+ const int m, const int n,
+ const float alpha,
+ const float* x, const int x_inc,
+ const float* y, const int y_inc,
+ float* a, const int a_ld);
+void PUBLIC_API cblas_dger(const CLBlastLayout layout,
+ const int m, const int n,
+ const double alpha,
+ const double* x, const int x_inc,
+ const double* y, const int y_inc,
+ double* a, const int a_ld);
+
+// General rank-1 complex matrix update: CGERU/ZGERU
+void PUBLIC_API cblas_cgeru(const CLBlastLayout layout,
+ const int m, const int n,
+ const void* alpha,
+ const void* x, const int x_inc,
+ const void* y, const int y_inc,
+ void* a, const int a_ld);
+void PUBLIC_API cblas_zgeru(const CLBlastLayout layout,
+ const int m, const int n,
+ const void* alpha,
+ const void* x, const int x_inc,
+ const void* y, const int y_inc,
+ void* a, const int a_ld);
+
+// General rank-1 complex conjugated matrix update: CGERC/ZGERC
+void PUBLIC_API cblas_cgerc(const CLBlastLayout layout,
+ const int m, const int n,
+ const void* alpha,
+ const void* x, const int x_inc,
+ const void* y, const int y_inc,
+ void* a, const int a_ld);
+void PUBLIC_API cblas_zgerc(const CLBlastLayout layout,
+ const int m, const int n,
+ const void* alpha,
+ const void* x, const int x_inc,
+ const void* y, const int y_inc,
+ void* a, const int a_ld);
+
+// Hermitian rank-1 matrix update: CHER/ZHER
+void PUBLIC_API cblas_cher(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const float alpha,
+ const void* x, const int x_inc,
+ void* a, const int a_ld);
+void PUBLIC_API cblas_zher(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const double alpha,
+ const void* x, const int x_inc,
+ void* a, const int a_ld);
+
+// Hermitian packed rank-1 matrix update: CHPR/ZHPR
+void PUBLIC_API cblas_chpr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const float alpha,
+ const void* x, const int x_inc,
+ void* ap);
+void PUBLIC_API cblas_zhpr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const double alpha,
+ const void* x, const int x_inc,
+ void* ap);
+
+// Hermitian rank-2 matrix update: CHER2/ZHER2
+void PUBLIC_API cblas_cher2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const void* alpha,
+ const void* x, const int x_inc,
+ const void* y, const int y_inc,
+ void* a, const int a_ld);
+void PUBLIC_API cblas_zher2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const void* alpha,
+ const void* x, const int x_inc,
+ const void* y, const int y_inc,
+ void* a, const int a_ld);
+
+// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2
+void PUBLIC_API cblas_chpr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const void* alpha,
+ const void* x, const int x_inc,
+ const void* y, const int y_inc,
+ void* ap);
+void PUBLIC_API cblas_zhpr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const void* alpha,
+ const void* x, const int x_inc,
+ const void* y, const int y_inc,
+ void* ap);
+
+// Symmetric rank-1 matrix update: SSYR/DSYR
+void PUBLIC_API cblas_ssyr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const float alpha,
+ const float* x, const int x_inc,
+ float* a, const int a_ld);
+void PUBLIC_API cblas_dsyr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const double alpha,
+ const double* x, const int x_inc,
+ double* a, const int a_ld);
+
+// Symmetric packed rank-1 matrix update: SSPR/DSPR
+void PUBLIC_API cblas_sspr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const float alpha,
+ const float* x, const int x_inc,
+ float* ap);
+void PUBLIC_API cblas_dspr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const double alpha,
+ const double* x, const int x_inc,
+ double* ap);
+
+// Symmetric rank-2 matrix update: SSYR2/DSYR2
+void PUBLIC_API cblas_ssyr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const float alpha,
+ const float* x, const int x_inc,
+ const float* y, const int y_inc,
+ float* a, const int a_ld);
+void PUBLIC_API cblas_dsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const double alpha,
+ const double* x, const int x_inc,
+ const double* y, const int y_inc,
+ double* a, const int a_ld);
+
+// Symmetric packed rank-2 matrix update: SSPR2/DSPR2
+void PUBLIC_API cblas_sspr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const float alpha,
+ const float* x, const int x_inc,
+ const float* y, const int y_inc,
+ float* ap);
+void PUBLIC_API cblas_dspr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const double alpha,
+ const double* x, const int x_inc,
+ const double* y, const int y_inc,
+ double* ap);
+
+// =================================================================================================
+// BLAS level-3 (matrix-matrix) routines
+// =================================================================================================
+
+// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM
+void PUBLIC_API cblas_sgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+ const int m, const int n, const int k,
+ const float alpha,
+ const float* a, const int a_ld,
+ const float* b, const int b_ld,
+ const float beta,
+ float* c, const int c_ld);
+void PUBLIC_API cblas_dgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+ const int m, const int n, const int k,
+ const double alpha,
+ const double* a, const int a_ld,
+ const double* b, const int b_ld,
+ const double beta,
+ double* c, const int c_ld);
+void PUBLIC_API cblas_cgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+ const int m, const int n, const int k,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* b, const int b_ld,
+ const void* beta,
+ void* c, const int c_ld);
+void PUBLIC_API cblas_zgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+ const int m, const int n, const int k,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* b, const int b_ld,
+ const void* beta,
+ void* c, const int c_ld);
+
+// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM
+void PUBLIC_API cblas_ssymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+ const int m, const int n,
+ const float alpha,
+ const float* a, const int a_ld,
+ const float* b, const int b_ld,
+ const float beta,
+ float* c, const int c_ld);
+void PUBLIC_API cblas_dsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+ const int m, const int n,
+ const double alpha,
+ const double* a, const int a_ld,
+ const double* b, const int b_ld,
+ const double beta,
+ double* c, const int c_ld);
+void PUBLIC_API cblas_csymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+ const int m, const int n,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* b, const int b_ld,
+ const void* beta,
+ void* c, const int c_ld);
+void PUBLIC_API cblas_zsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+ const int m, const int n,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* b, const int b_ld,
+ const void* beta,
+ void* c, const int c_ld);
+
+// Hermitian matrix-matrix multiplication: CHEMM/ZHEMM
+void PUBLIC_API cblas_chemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+ const int m, const int n,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* b, const int b_ld,
+ const void* beta,
+ void* c, const int c_ld);
+void PUBLIC_API cblas_zhemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+ const int m, const int n,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* b, const int b_ld,
+ const void* beta,
+ void* c, const int c_ld);
+
+// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK
+void PUBLIC_API cblas_ssyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+ const int n, const int k,
+ const float alpha,
+ const float* a, const int a_ld,
+ const float beta,
+ float* c, const int c_ld);
+void PUBLIC_API cblas_dsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+ const int n, const int k,
+ const double alpha,
+ const double* a, const int a_ld,
+ const double beta,
+ double* c, const int c_ld);
+void PUBLIC_API cblas_csyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+ const int n, const int k,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* beta,
+ void* c, const int c_ld);
+void PUBLIC_API cblas_zsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+ const int n, const int k,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* beta,
+ void* c, const int c_ld);
+
+// Rank-K update of a hermitian matrix: CHERK/ZHERK
+void PUBLIC_API cblas_cherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+ const int n, const int k,
+ const float alpha,
+ const void* a, const int a_ld,
+ const float beta,
+ void* c, const int c_ld);
+void PUBLIC_API cblas_zherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+ const int n, const int k,
+ const double alpha,
+ const void* a, const int a_ld,
+ const double beta,
+ void* c, const int c_ld);
+
+// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K
+void PUBLIC_API cblas_ssyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+ const int n, const int k,
+ const float alpha,
+ const float* a, const int a_ld,
+ const float* b, const int b_ld,
+ const float beta,
+ float* c, const int c_ld);
+void PUBLIC_API cblas_dsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+ const int n, const int k,
+ const double alpha,
+ const double* a, const int a_ld,
+ const double* b, const int b_ld,
+ const double beta,
+ double* c, const int c_ld);
+void PUBLIC_API cblas_csyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+ const int n, const int k,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* b, const int b_ld,
+ const void* beta,
+ void* c, const int c_ld);
+void PUBLIC_API cblas_zsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+ const int n, const int k,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* b, const int b_ld,
+ const void* beta,
+ void* c, const int c_ld);
+
+// Rank-2K update of a hermitian matrix: CHER2K/ZHER2K
+void PUBLIC_API cblas_cher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+ const int n, const int k,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* b, const int b_ld,
+ const float beta,
+ void* c, const int c_ld);
+void PUBLIC_API cblas_zher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+ const int n, const int k,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* b, const int b_ld,
+ const double beta,
+ void* c, const int c_ld);
+
+// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM
+void PUBLIC_API cblas_strmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int m, const int n,
+ const float alpha,
+ const float* a, const int a_ld,
+ float* b, const int b_ld);
+void PUBLIC_API cblas_dtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int m, const int n,
+ const double alpha,
+ const double* a, const int a_ld,
+ double* b, const int b_ld);
+void PUBLIC_API cblas_ctrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int m, const int n,
+ const void* alpha,
+ const void* a, const int a_ld,
+ void* b, const int b_ld);
+void PUBLIC_API cblas_ztrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int m, const int n,
+ const void* alpha,
+ const void* a, const int a_ld,
+ void* b, const int b_ld);
+
+// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM
+void PUBLIC_API cblas_strsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int m, const int n,
+ const float alpha,
+ const float* a, const int a_ld,
+ float* b, const int b_ld);
+void PUBLIC_API cblas_dtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int m, const int n,
+ const double alpha,
+ const double* a, const int a_ld,
+ double* b, const int b_ld);
+void PUBLIC_API cblas_ctrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int m, const int n,
+ const void* alpha,
+ const void* a, const int a_ld,
+ void* b, const int b_ld);
+void PUBLIC_API cblas_ztrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int m, const int n,
+ const void* alpha,
+ const void* a, const int a_ld,
+ void* b, const int b_ld);
+
+// =================================================================================================
+// Extra non-BLAS routines (level-X)
+// =================================================================================================
+
+// Scaling and out-of-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY
+void PUBLIC_API cblas_somatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const int m, const int n,
+ const float alpha,
+ const float* a, const int a_ld,
+ float* b, const int b_ld);
+void PUBLIC_API cblas_domatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const int m, const int n,
+ const double alpha,
+ const double* a, const int a_ld,
+ double* b, const int b_ld);
+void PUBLIC_API cblas_comatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const int m, const int n,
+ const void* alpha,
+ const void* a, const int a_ld,
+ void* b, const int b_ld);
+void PUBLIC_API cblas_zomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const int m, const int n,
+ const void* alpha,
+ const void* a, const int a_ld,
+ void* b, const int b_ld);
+
+// =================================================================================================
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+// CLBLAST_CLBLAST_NETLIB_C_H_
+#endif
diff --git a/samples/sgemm_netlib.c b/samples/sgemm_netlib.c
new file mode 100644
index 00000000..0c8f76e9
--- /dev/null
+++ b/samples/sgemm_netlib.c
@@ -0,0 +1,69 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file demonstrates the use of the Netlib CBLAS API of the CLBlast library. This API is not
+// recommended if you want full control over performance: it will internally copy buffers from and
+// to the OpenCL device.
+//
+// Note that this example is meant for illustration purposes only. CLBlast provides other programs
+// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx').
+//
+// =================================================================================================
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+// Includes the CLBlast library (Netlib CBLAS interface)
+#include <clblast_netlib_c.h>
+
+// =================================================================================================
+
+// Example use of the single-precision routine SGEMM through the Netlib CBLAS interface
+int main(void) {
+
+ // Example SGEMM arguments (row-major, no transposes: A is m-by-k, B is k-by-n, C is m-by-n)
+ const int m = 128;
+ const int n = 64;
+ const int k = 512;
+ const float alpha = 0.7f;
+ const float beta = 1.0f;
+ const int a_ld = k;
+ const int b_ld = n;
+ const int c_ld = n;
+
+ // Allocate host matrices and fill them with example data (malloc results are not checked)
+ float* host_a = (float*)malloc(sizeof(float)*m*k);
+ float* host_b = (float*)malloc(sizeof(float)*n*k);
+ float* host_c = (float*)malloc(sizeof(float)*m*n);
+ for (int i=0; i<m*k; ++i) { host_a[i] = 12.193f; }
+ for (int i=0; i<n*k; ++i) { host_b[i] = -8.199f; }
+ for (int i=0; i<m*n; ++i) { host_c[i] = 0.0f; }
+
+ // Call the SGEMM routine directly on the host pointers; host_c is the only non-const matrix
+ cblas_sgemm(CLBlastLayoutRowMajor,
+ CLBlastTransposeNo, CLBlastTransposeNo,
+ m, n, k,
+ alpha,
+ host_a, a_ld,
+ host_b, b_ld,
+ beta,
+ host_c, c_ld);
+
+ // Example completed (the contents of host_c are not inspected or verified here)
+ printf("Completed SGEMM\n");
+
+ // Clean-up of the host allocations
+ free(host_a);
+ free(host_b);
+ free(host_c);
+ return 0;
+}
+
+// =================================================================================================
diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py
index aabce8d7..35d902b7 100755
--- a/scripts/generator/generator.py
+++ b/scripts/generator/generator.py
@@ -12,6 +12,8 @@
# clblast.cpp
# clblast_c.h
# clblast_c.cpp
+# clblast_netlib_c.h
+# clblast_netlib_c.cpp
# wrapper_clblas.h
# wrapper_cblas.h
# It also generates the main functions for the correctness and performance tests as found in
@@ -36,9 +38,11 @@ FILES = [
"/src/clblast_c.cpp",
"/test/wrapper_clblas.hpp",
"/test/wrapper_cblas.hpp",
+ "/include/clblast_netlib_c.h",
+ "/src/clblast_netlib_c.cpp",
]
-HEADER_LINES = [117, 73, 118, 22, 29, 41]
-FOOTER_LINES = [17, 80, 19, 18, 6, 6]
+HEADER_LINES = [117, 73, 118, 22, 29, 41, 65, 32]
+FOOTER_LINES = [17, 80, 19, 18, 6, 6, 9, 2]
# Different possibilities for requirements
ald_m = "The value of `a_ld` must be at least `m`."
@@ -55,70 +59,105 @@ bld_trans_n_k = "When `transpose == Transpose::kNo`, then `b_ld` must be at leas
cld_m = "The value of `c_ld` must be at least `m`."
cld_n = "The value of `c_ld` must be at least `n`."
+
+# Helper functions to compute vector and matrix sizes (they return C-style expression strings)
+def size_helper(condition, size_one, size_two, multiplier):  # "(cond) ? one * mult : two * mult"
+ length = "(" + condition + ")" + " ? " + size_one + " * " + multiplier + " : " + size_two + " * " + multiplier
+ return length
+
+
+def layout_transpose_condition(prefix):  # col-major and <prefix>_transpose != No, or row-major and == No
+ return "(layout == CLBlastLayoutColMajor && " + prefix + "_transpose != CLBlastTransposeNo) || " +\
+ "(layout == CLBlastLayoutRowMajor && " + prefix + "_transpose == CLBlastTransposeNo)"
+
+
+# Different possibilities for the vector and matrix sizes (expression strings, see size_helper)
+xn = "n * x_inc"
+xm = "m * x_inc"
+yn = "n * y_inc"
+ym = "m * y_inc"
+an = "n * a_ld"
+apn = "((n*(n+1)) / 2)"  # used as the size of the packed 'ap' buffers
+cn = "n * c_ld"
+xmn = size_helper("a_transpose != CLBlastTransposeNo", "m", "n", "x_inc")
+ynm = size_helper("a_transpose != CLBlastTransposeNo", "n", "m", "y_inc")
+amn = size_helper("layout == CLBlastLayoutRowMajor", "m", "n", "a_ld")
+amns = size_helper("side == CLBlastSideLeft", "m", "n", "a_ld")
+amk = size_helper(layout_transpose_condition("a"), "m", "k", "a_ld")
+ank = size_helper(layout_transpose_condition("a"), "n", "k", "a_ld")
+ankab = size_helper(layout_transpose_condition("ab"), "n", "k", "a_ld")
+bkn = size_helper(layout_transpose_condition("b"), "k", "n", "b_ld")
+bnkab = size_helper(layout_transpose_condition("ab"), "n", "k", "b_ld")
+bmn = size_helper("layout == CLBlastLayoutRowMajor", "m", "n", "b_ld")
+bnma = size_helper(layout_transpose_condition("a"), "n", "m", "b_ld")
+cmn = size_helper("layout == CLBlastLayoutRowMajor", "m", "n", "c_ld")
+ammn = size_helper("layout == CLBlastLayoutRowMajor", "m", "((side == CLBlastSideLeft) ? m : n)", "a_ld")
+bmnn = size_helper("layout == CLBlastLayoutRowMajor", "((side == CLBlastSideLeft) ? m : n)", "n", "b_ld")
+
# ==================================================================================================
# Populates a list of routines
ROUTINES = [
[ # Level 1: vector-vector
- Routine(False, True, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], [], "", "Generate givens plane rotation", "", []),
- Routine(False, True, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], [], "", "Generate modified givens plane rotation", "", []),
- Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["cos","sin"], "", "Apply givens plane rotation", "", []),
- Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [], "", "Apply modified givens plane rotation", "", []),
- Routine(True, True, "1", "swap", T, [S,D,C,Z,H], ["n"], [], [], ["x","y"], [], "", "Swap two vectors", "Interchanges _n_ elements of vectors _x_ and _y_.", []),
- Routine(True, True, "1", "scal", T, [S,D,C,Z,H], ["n"], [], [], ["x"], ["alpha"], "", "Vector scaling", "Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.", []),
- Routine(True, True, "1", "copy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [], "", "Vector copy", "Copies the contents of vector _x_ into vector _y_.", []),
- Routine(True, True, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant.", []),
- Routine(True, True, "1", "dot", T, [S,D,H], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two vectors", "Multiplies _n_ elements of the vectors _x_ and _y_ element-wise and accumulates the results. The sum is stored in the _dot_ buffer.", []),
- Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []),
- Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []),
- Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []),
- Routine(True, True, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []),
- Routine(True, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []),
- Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []),
- Routine(True, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []),
- Routine(True, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []),
+ Routine(False, True, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], ["1","1","1","1"], [], "", "Generate givens plane rotation", "", []),
+ Routine(False, True, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], ["1","1","1","1","1"], [], "", "Generate modified givens plane rotation", "", []),
+ Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], [xn,yn], ["cos","sin"],"", "Apply givens plane rotation", "", []),
+ Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [xn,yn,"1"], [], "", "Apply modified givens plane rotation", "", []),
+ Routine(True, True, "1", "swap", T, [S,D,C,Z,H], ["n"], [], [], ["x","y"], [xn,yn], [], "", "Swap two vectors", "Interchanges _n_ elements of vectors _x_ and _y_.", []),
+ Routine(True, True, "1", "scal", T, [S,D,C,Z,H], ["n"], [], [], ["x"], [xn], ["alpha"], "", "Vector scaling", "Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.", []),
+ Routine(True, True, "1", "copy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [xn,yn], [], "", "Vector copy", "Copies the contents of vector _x_ into vector _y_.", []),
+ Routine(True, True, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [xn,yn], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant.", []),
+ Routine(True, True, "1", "dot", T, [S,D,H], ["n"], [], ["x","y"], ["dot"], [xn,yn,"1"], [], "n", "Dot product of two vectors", "Multiplies _n_ elements of the vectors _x_ and _y_ element-wise and accumulates the results. The sum is stored in the _dot_ buffer.", []),
+ Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [xn,yn,"1"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []),
+ Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [xn,yn,"1"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []),
+ Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], [xn,"1"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []),
+ Routine(True, True, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], [xn,"1"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []),
+ Routine(True, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], [xn,"1"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []),
+ Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [xn,"1"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []),
+ Routine(True, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [xn,"1"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []),
+ Routine(True, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], [xn,"1"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []),
],
[ # Level 2: matrix-vector
- Routine(True, True, "2a", "gemv", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]),
- Routine(True, True, "2a", "gbmv", T, [S,D,C,Z,H], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is banded instead.", [ald_kl_ku_one]),
- Routine(True, True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian matrix instead.", [ald_n]),
- Routine(True, True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian banded matrix instead.", [ald_k_one]),
- Routine(True, True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
- Routine(True, True, "2a", "symv", T, [S,D,H], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric instead.", [ald_n]),
- Routine(True, True, "2a", "sbmv", T, [S,D,H], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric and banded instead.", [ald_k_one]),
- Routine(True, True, "2a", "spmv", T, [S,D,H], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
- Routine(True, True, "2a", "trmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular instead.", [ald_n]),
- Routine(True, True, "2a", "tbmv", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular and banded instead.", [ald_k_one]),
- Routine(True, True, "2a", "tpmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and repreented as _AP_.", []),
- Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a triangular system of equations", "", []),
- Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a banded triangular system of equations", "", [ald_k_one]),
- Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "", "Solves a packed triangular system of equations", "", []),
+ Routine(True, True, "2a", "gemv", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], [amn,xmn,ynm], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]),
+ Routine(True, True, "2a", "gbmv", T, [S,D,C,Z,H], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], [amn,xmn,ynm], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is banded instead.", [ald_kl_ku_one]),
+ Routine(True, True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian matrix instead.", [ald_n]),
+ Routine(True, True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian banded matrix instead.", [ald_k_one]),
+ Routine(True, True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], [apn,xn,yn], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
+ Routine(True, True, "2a", "symv", T, [S,D,H], ["n"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric instead.", [ald_n]),
+ Routine(True, True, "2a", "sbmv", T, [S,D,H], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric and banded instead.", [ald_k_one]),
+ Routine(True, True, "2a", "spmv", T, [S,D,H], ["n"], ["layout","triangle"], ["ap","x"], ["y"], [apn,xn,yn], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
+ Routine(True, True, "2a", "trmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular instead.", [ald_n]),
+ Routine(True, True, "2a", "tbmv", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular and banded instead.", [ald_k_one]),
+ Routine(True, True, "2a", "tpmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [apn,xn], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and repreented as _AP_.", []),
+ Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "", "Solves a triangular system of equations", "", []),
+ Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "", "Solves a banded triangular system of equations", "", [ald_k_one]),
+ Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [apn,xn], [], "", "Solves a packed triangular system of equations", "", []),
# Level 2: matrix update
- Routine(True, True, "2b", "ger", T, [S,D,H], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 matrix update", "Performs the operation _A = alpha * x * y^T + A_, in which _x_ is an input vector, _y^T_ is the transpose of the input vector _y_, _A_ is the matrix to be updated, and _alpha_ is a scalar value.", [ald_m]),
- Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex matrix update", "Same operation as xGER, but with complex data-types.", [ald_m]),
- Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex conjugated matrix update", "Same operation as xGERU, but the update is done based on the complex conjugate of the input vectors.", [ald_m]),
- Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Hermitian rank-1 matrix update", "Performs the operation _A = alpha * x * x^T + A_, in which x is an input vector, x^T is the transpose of this vector, _A_ is the triangular Hermetian matrix to be updated, and alpha is a scalar value.", [ald_n]),
- Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Hermitian packed rank-1 matrix update", "Same operation as xHER, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
- Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Hermitian rank-2 matrix update", "Performs the operation _A = alpha * x * y^T + conj(alpha) * y * x^T + A_, in which _x_ is an input vector and _x^T_ its transpose, _y_ is an input vector and _y^T_ its transpose, _A_ is the triangular Hermetian matrix to be updated, _alpha_ is a scalar value and _conj(alpha)_ its complex conjugate.", [ald_n]),
- Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Hermitian packed rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
- Routine(True, True, "2b", "syr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Symmetric rank-1 matrix update", "Same operation as xHER, but matrix A is a symmetric matrix instead.", [ald_n]),
- Routine(True, True, "2b", "spr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Symmetric packed rank-1 matrix update", "Same operation as xSPR, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
- Routine(True, True, "2b", "syr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Symmetric rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is a symmetric matrix instead.", [ald_n]),
- Routine(True, True, "2b", "spr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Symmetric packed rank-2 matrix update", "Same operation as xSPR2, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
+ Routine(True, True, "2b", "ger", T, [S,D,H], ["m","n"], ["layout"], ["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 matrix update", "Performs the operation _A = alpha * x * y^T + A_, in which _x_ is an input vector, _y^T_ is the transpose of the input vector _y_, _A_ is the matrix to be updated, and _alpha_ is a scalar value.", [ald_m]),
+ Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 complex matrix update", "Same operation as xGER, but with complex data-types.", [ald_m]),
+ Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 complex conjugated matrix update", "Same operation as xGERU, but the update is done based on the complex conjugate of the input vectors.", [ald_m]),
+ Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], [xn,an], ["alpha"], "", "Hermitian rank-1 matrix update", "Performs the operation _A = alpha * x * x^T + A_, in which x is an input vector, x^T is the transpose of this vector, _A_ is the triangular Hermetian matrix to be updated, and alpha is a scalar value.", [ald_n]),
+ Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], [xn,apn], ["alpha"], "", "Hermitian packed rank-1 matrix update", "Same operation as xHER, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
+ Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], [xn,yn,an], ["alpha"], "", "Hermitian rank-2 matrix update", "Performs the operation _A = alpha * x * y^T + conj(alpha) * y * x^T + A_, in which _x_ is an input vector and _x^T_ its transpose, _y_ is an input vector and _y^T_ its transpose, _A_ is the triangular Hermetian matrix to be updated, _alpha_ is a scalar value and _conj(alpha)_ its complex conjugate.", [ald_n]),
+ Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], [xn,yn,apn], ["alpha"], "", "Hermitian packed rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
+ Routine(True, True, "2b", "syr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["a"], [xn,an], ["alpha"], "", "Symmetric rank-1 matrix update", "Same operation as xHER, but matrix A is a symmetric matrix instead.", [ald_n]),
+ Routine(True, True, "2b", "spr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["ap"], [xn,apn], ["alpha"], "", "Symmetric packed rank-1 matrix update", "Same operation as xSPR, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
+ Routine(True, True, "2b", "syr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["a"], [xn,yn,an], ["alpha"], "", "Symmetric rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is a symmetric matrix instead.", [ald_n]),
+ Routine(True, True, "2b", "spr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["ap"], [xn,yn,apn], ["alpha"], "", "Symmetric packed rank-2 matrix update", "Same operation as xSPR2, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
],
[ # Level 3: matrix-matrix
- Routine(True, True, "3", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "General matrix-matrix multiplication", "Performs the matrix product _C = alpha * A * B + beta * C_, in which _A_ (_m_ by _k_) and _B_ (_k_ by _n_) are two general rectangular input matrices, _C_ (_m_ by _n_) is the matrix to be updated, and _alpha_ and _beta_ are scalar values. The matrices _A_ and/or _B_ can optionally be transposed before performing the operation.", [ald_transa_m_k, bld_transb_k_n, cld_m]),
- Routine(True, True, "3", "symm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "Same operation as xGEMM, but _A_ is symmetric instead. In case of `side == kLeft`, _A_ is a symmetric _m_ by _m_ matrix and _C = alpha * A * B + beta * C_ is performed. Otherwise, in case of `side == kRight`, _A_ is a symmtric _n_ by _n_ matrix and _C = alpha * B * A + beta * C_ is performed.", [ald_side_m_n, bld_m, cld_m]),
- Routine(True, True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "Same operation as xSYMM, but _A_ is an Hermitian matrix instead.", [ald_side_m_n, bld_m, cld_m]),
- Routine(True, True, "3", "syrk", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * A^T + beta * C_ or _C = alpha * A^T * A + beta * C_, in which _A_ is a general matrix and _A^T_ is its transpose, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, cld_m]),
- Routine(True, True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "Same operation as xSYRK, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, cld_m]),
- Routine(True, True, "3", "syr2k", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
- Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "Same operation as xSYR2K, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
- Routine(True, True, "3", "trmm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]),
- Routine(False, True, "3", "trsm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Solves a triangular system of equations", "", []),
+ Routine(True, True, "3", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], [amk,bkn,cmn], ["alpha","beta"], "", "General matrix-matrix multiplication", "Performs the matrix product _C = alpha * A * B + beta * C_, in which _A_ (_m_ by _k_) and _B_ (_k_ by _n_) are two general rectangular input matrices, _C_ (_m_ by _n_) is the matrix to be updated, and _alpha_ and _beta_ are scalar values. The matrices _A_ and/or _B_ can optionally be transposed before performing the operation.", [ald_transa_m_k, bld_transb_k_n, cld_m]),
+ Routine(True, True, "3", "symm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], [ammn,bmnn,cmn], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "Same operation as xGEMM, but _A_ is symmetric instead. In case of `side == kLeft`, _A_ is a symmetric _m_ by _m_ matrix and _C = alpha * A * B + beta * C_ is performed. Otherwise, in case of `side == kRight`, _A_ is a symmtric _n_ by _n_ matrix and _C = alpha * B * A + beta * C_ is performed.", [ald_side_m_n, bld_m, cld_m]),
+ Routine(True, True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], [ammn,bmnn,cmn], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "Same operation as xSYMM, but _A_ is an Hermitian matrix instead.", [ald_side_m_n, bld_m, cld_m]),
+ Routine(True, True, "3", "syrk", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], [ank,cn], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * A^T + beta * C_ or _C = alpha * A^T * A + beta * C_, in which _A_ is a general matrix and _A^T_ is its transpose, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, cld_m]),
+ Routine(True, True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], [ank,cn], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "Same operation as xSYRK, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, cld_m]),
+ Routine(True, True, "3", "syr2k", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], [ankab,bnkab,cn],["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
+ Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], [ankab,bnkab,cn],["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "Same operation as xSYR2K, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
+ Routine(True, True, "3", "trmm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], [amns,bmn], ["alpha"], "", "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]),
+ Routine(False, True, "3", "trsm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], [amns,bmn], ["alpha"], "", "Solves a triangular system of equations", "", []),
],
[ # Level X: extra routines (not part of BLAS)
- Routine(True, True, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]),
+ Routine(True, True, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], [amn,bnma], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]),
]]
@@ -165,6 +204,10 @@ def main(argv):
body += cpp.wrapper_clblas(routine)
if i == 5:
body += cpp.wrapper_cblas(routine)
+ if i == 6:
+ body += cpp.clblast_netlib_c_h(routine)
+ if i == 7:
+ body += cpp.clblast_netlib_c_cc(routine)
f.write("".join(file_header))
f.write(body)
f.write("".join(file_footer))
diff --git a/scripts/generator/generator/cpp.py b/scripts/generator/generator/cpp.py
index 9f3089f5..6bb3080f 100644
--- a/scripts/generator/generator/cpp.py
+++ b/scripts/generator/generator/cpp.py
@@ -95,6 +95,79 @@ def clblast_c_cc(routine):
return result
+def clblast_netlib_c_h(routine):
+ """The Netlib CBLAS API header (.h): returns a comment line built from the routine's description
+ and short names, followed by one ';'-terminated declaration per flavour whose precision name is
+ S, D, C or Z"""
+ result = NL + "// " + routine.description + ": " + routine.short_names() + NL
+ for flavour in routine.flavours:
+ # Flavours with any other precision name get no declaration here
+ if flavour.precision_name in ["S", "D", "C", "Z"]:
+ # 20 is the 'spaces' argument and " PUBLIC_API" the extra qualifier of routine_header_netlib
+ result += routine.routine_header_netlib(flavour, 20, " PUBLIC_API") + ";" + NL
+ return result
+
+
+def clblast_netlib_c_cc(routine):
+ """The Netlib CBLAS API implementation (.cpp): returns the C++ source text of one function per
+ flavour whose precision name is S, D, C or Z, preceded by a comment with the upper-cased
+ routine name"""
+ result = NL + "// " + routine.name.upper() + NL
+ for flavour in routine.flavours:
+
+ # There is a version available in CBLAS
+ if flavour.precision_name in ["S", "D", "C", "Z"]:
+ # The explicit template argument is only emitted when routine.no_scalars() is true
+ template = "<" + flavour.template + ">" if routine.no_scalars() else ""
+ # NOTE(review): 'name_postfix' is not referenced again in this function
+ name_postfix = "_sub" if routine.name in routine.routines_scalar_no_return() else ""
+ # Indentation used for the continuation lines of the generated CLBlast call
+ indent = " " * (21 + routine.length() + len(template))
+ result += routine.routine_header_netlib(flavour, 9, "") + " {" + NL
+
+ # Initialize OpenCL: every generated function sets up its own device, context and queue
+ result += " auto device = get_device();" + NL
+ result += " auto context = clblast::Context(device);" + NL
+ result += " auto queue = clblast::Queue(context, device);" + NL
+
+ # Set alpha and beta
+ result += "".join(" " + s + NL for s in routine.scalar_create_cpp(flavour))
+
+ # Copy data structures to the device. This takes three passes over inputs + outputs:
+ # first the '<name>_size' constants (buffer_sizes is indexed in that same order) ...
+ for i, name in enumerate(routine.inputs + routine.outputs):
+ result += " " + routine.set_size(name, routine.buffer_sizes[i]) + NL
+ # ... then the device buffers, plus a one-element '<name>_vec' array for the names that
+ # are passed by value (write_buffer reads from that array) ...
+ for i, name in enumerate(routine.inputs + routine.outputs):
+ buffer_type = routine.get_buffer_type(name, flavour)
+ result += " " + routine.create_buffer(name, buffer_type) + NL
+ if name in routine.scalar_buffers_second_non_pointer():
+ result += " " + buffer_type + " " + name + "_vec[1]; " + name + "_vec[0] = " + name + ";" + NL
+ # ... and finally the host-to-device writes, skipping names in scalar_buffers_first().
+ # Names that are not outputs are cast through a const-qualified type.
+ for name in routine.inputs + routine.outputs:
+ if name not in routine.scalar_buffers_first():
+ prefix = "" if name in routine.outputs else "const "
+ buffer_type = routine.get_buffer_type(name, flavour)
+ result += " " + routine.write_buffer(name, prefix + buffer_type) + NL
+
+ # The function call: the generated code keeps the returned status in 's' and passes the
+ # address of 'queue_cl' as the last argument
+ result += " auto queue_cl = queue();" + NL
+ result += " auto s = clblast::" + routine.name.capitalize() + template + "("
+ result += ("," + NL + indent).join([a for a in routine.arguments_netlib(flavour, indent)])
+ result += "," + NL + indent + "&queue_cl);" + NL
+
+ # Error handling: the generated code throws std::runtime_error on any non-success status
+ result += " if (s != clblast::StatusCode::kSuccess) {" + NL
+ result += " throw std::runtime_error(\"CLBlast returned with error code \" + clblast::ToString(s));" + NL
+ result += " }" + NL
+
+ # Copy back and clean-up: outputs in scalar_buffers_first() of routines that are not in
+ # routines_scalar_no_return() get a local array declared to read the result into
+ for name in routine.outputs:
+ if name in routine.scalar_buffers_first() and routine.name not in routine.routines_scalar_no_return():
+ buffer_type = routine.get_buffer_type(name, flavour)
+ result += " " + buffer_type + " " + name + "[" + name + "_size];" + NL
+ # Every output is read back from its device buffer
+ for name in routine.outputs:
+ buffer_type = routine.get_buffer_type(name, flavour)
+ result += " " + routine.read_buffer(name, buffer_type) + NL
+ # Emit the return statement: element 0 of the local array, with '.real()' appended for
+ # float2/double2 flavours unless the name is an index buffer
+ for name in routine.outputs:
+ if name in routine.scalar_buffers_first() and routine.name not in routine.routines_scalar_no_return():
+ result += " return " + name + "[0]"
+ if flavour.buffer_type in ["float2", "double2"]:
+ if name not in routine.index_buffers():
+ result += ".real()"
+ result += ";" + NL
+ result += "}" + NL
+ return result
+
+
def wrapper_clblas(routine):
"""The wrapper to the reference clBLAS routines (for performance/correctness testing)"""
result = ""
diff --git a/scripts/generator/generator/datatype.py b/scripts/generator/generator/datatype.py
index 9a6c6c02..98874174 100644
--- a/scripts/generator/generator/datatype.py
+++ b/scripts/generator/generator/datatype.py
@@ -54,6 +54,22 @@ class DataType:
return self.beta_cl + "{{beta.real(), beta.imag()}}"
return "beta"
+ def use_alpha_clblast(self):
+ """Transforms a Netlib CBLAS parameter to CLBlast style: returns the C++ expression to use for
+ alpha. When alpha_cpp is D_FLOAT2 or D_DOUBLE2, the expression brace-initialises that type
+ from elements [0] and [1] read through 'alpha' cast to a const float/double pointer;
+ otherwise it is just the name 'alpha'"""
+ if self.alpha_cpp == D_FLOAT2:
+ return self.alpha_cpp + "{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]}"
+ elif self.alpha_cpp == D_DOUBLE2:
+ return self.alpha_cpp + "{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]}"
+ return "alpha"
+
+ def use_beta_clblast(self):
+ """Returns the C++ expression to use for beta. When beta_cpp is D_FLOAT2 or D_DOUBLE2, the
+ expression brace-initialises that type from elements [0] and [1] read through 'beta' cast to a
+ const float/double pointer; otherwise it is just the name 'beta'"""
+ if self.beta_cpp == D_FLOAT2:
+ return self.beta_cpp + "{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]}"
+ elif self.beta_cpp == D_DOUBLE2:
+ return self.beta_cpp + "{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]}"
+ return "beta"
+
def test_template(self):
"""Returns the template as used in the correctness/performance tests"""
if self.buffer_type != self.beta_cpp:
@@ -65,6 +81,10 @@ class DataType:
return ((scalar == "alpha" and self.alpha_cpp in [D_FLOAT2, D_DOUBLE2]) or
(scalar == "beta" and self.beta_cpp in [D_FLOAT2, D_DOUBLE2]))
+ def is_non_standard(self):
+ """Current type is of a non-standard type: true when buffer_type is D_HALF, D_FLOAT2 or
+ D_DOUBLE2"""
+ return self.buffer_type in [D_HALF, D_FLOAT2, D_DOUBLE2]
+
# Regular data-types
H = DataType("H", "H", D_HALF, [D_HALF] * 2 + [D_HALF_OPENCL] * 2, D_HALF) # half (16)
diff --git a/scripts/generator/generator/routine.py b/scripts/generator/generator/routine.py
index 2fa5e9d6..6fcce23b 100644
--- a/scripts/generator/generator/routine.py
+++ b/scripts/generator/generator/routine.py
@@ -13,7 +13,8 @@ import generator.convert as convert
class Routine:
"""Class holding routine-specific information (e.g. name, which arguments, which precisions)"""
def __init__(self, implemented, has_tests, level, name, template, flavours, sizes, options,
- inputs, outputs, scalars, scratch, description, details, requirements):
+ inputs, outputs, buffer_sizes, scalars, scratch,
+ description, details, requirements):
self.implemented = implemented
self.has_tests = has_tests
self.level = level
@@ -24,6 +25,7 @@ class Routine:
self.options = options
self.inputs = inputs
self.outputs = outputs
+ self.buffer_sizes = buffer_sizes
self.scalars = scalars
self.scratch = scratch # Scratch buffer (e.g. for xDOT)
self.description = description
@@ -41,6 +43,11 @@ class Routine:
return ["sa", "sb", "sc", "ss", "sd1", "sd2", "sx1", "sy1", "sparam"]
@staticmethod
+ def scalar_buffers_second_non_pointer():
+ """As above, but these ones are not passed as pointers but as scalars instead (currently only
+ "sy1")"""
+ return ["sy1"]
+
+ @staticmethod
def other_scalars():
"""List of scalars other than alpha and beta"""
return ["cos", "sin"]
@@ -65,6 +72,34 @@ class Routine:
"""Distinguish between vectors and matrices"""
return ["a", "b", "c", "ap"]
+ @staticmethod
+ def routines_scalar_no_return():
+ """Names of the routines ("dotu" and "dotc") for which routine_header_netlib appends "_sub" to
+ the function name and leaves the return type at its default, and for which
+ arguments_def_netlib appends the scalar_buffers_first() arguments to the end of the list"""
+ return ["dotu", "dotc"]
+
+ @staticmethod
+ def set_size(name, size):
+ """Sets the size of a buffer: returns the C++ statement defining the constant '<name>_size'
+ with the expression 'size' as its value"""
+ return "const auto " + name + "_size = " + size + ";"
+
+ @staticmethod
+ def create_buffer(name, template):
+ """Creates a new CLCudaAPI buffer: returns the C++ statement defining '<name>_buffer' as a
+ clblast::Buffer of element type 'template', constructed from 'context' and '<name>_size'"""
+ return "auto " + name + "_buffer = clblast::Buffer<" + template + ">(context, " + name + "_size);"
+
+ def write_buffer(self, name, template):
+ """Writes to a CLCudaAPI buffer: returns the C++ statement that calls Write on '<name>_buffer'
+ with 'queue', '<name>_size' and the host data cast to a pointer to 'template'"""
+ postfix = ""
+ # Names passed by value are read from '<name>_vec' instead of '<name>'
+ if name in self.scalar_buffers_second_non_pointer():
+ postfix = "_vec"
+ data_structure = "reinterpret_cast<" + template + "*>(" + name + postfix + ")"
+ return name + "_buffer.Write(queue, " + name + "_size, " + data_structure + ");"
+
+ @staticmethod
+ def read_buffer(name, template):
+ """Reads from a CLCudaAPI buffer: returns the C++ statement that calls Read on '<name>_buffer'
+ with 'queue', '<name>_size' and '<name>' cast to a pointer to 'template'"""
+ data_structure = "reinterpret_cast<" + template + "*>(" + name + ")"
+ return name + "_buffer.Read(queue, " + name + "_size, " + data_structure + ");"
+
def non_index_inputs(self):
"""Lists of input/output buffers not index (integer)"""
buffers = self.inputs[:] # make a copy
@@ -85,6 +120,11 @@ class Routine:
"""List of buffers without 'inc' or 'ld'"""
return self.scalar_buffers_first() + self.scalar_buffers_second() + ["ap"]
+ def get_buffer_type(self, name, flavour):
+ """Returns "int" for names listed in index_buffers(), otherwise the flavour's buffer_type"""
+ if name in self.index_buffers():
+ return "int"
+ return flavour.buffer_type
+
def length(self):
"""Retrieves the number of characters in the routine's name"""
return len(self.name)
@@ -133,6 +173,15 @@ class Routine:
return [", ".join(a + b + c)]
return []
+ def buffer_zero_offset(self, name):
+ """As above, but with an offset value of zero: returns a one-element list holding the
+ comma-separated string '<name>_buffer(), 0', or an empty list when 'name' is neither an input
+ nor an output"""
+ if name in self.inputs or name in self.outputs:
+ a = [name + "_buffer()"]
+ b = ["0"]
+ # The '<name>_<postfix>' argument is left out for names in buffers_without_ld_inc()
+ c = [name + "_" + self.postfix(name)] if (name not in self.buffers_without_ld_inc()) else []
+ return [", ".join(a + b + c)]
+ return []
+
def buffer_def(self, name):
"""As above but with data-types"""
prefix = "const " if name in self.inputs else ""
@@ -163,6 +212,17 @@ class Routine:
return [", ".join(a + b + c)]
return []
+ def buffer_def_pointer(self, name, flavour):
+ """As above but as plain C pointer: returns a one-element list with the parameter definition
+ for 'name', or an empty list when 'name' is neither an input nor an output"""
+ # Inputs are const-qualified
+ prefix = "const " if name in self.inputs else ""
+ if name in self.inputs or name in self.outputs:
+ # Flavours for which is_non_standard() is true are exposed as 'void'
+ data_type = "void" if flavour.is_non_standard() else flavour.buffer_type
+ # Names in scalar_buffers_second_non_pointer() are passed by value, so they get no '*'
+ pointer = "" if name in self.scalar_buffers_second_non_pointer() else "*"
+ a = [prefix + data_type + pointer + " " + name + ""]
+ # A 'const int <name>_<postfix>' parameter follows unless the name is in buffers_without_ld_inc()
+ c = ["const int " + name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else []
+ return [", ".join(a + c)]
+ return []
+
def buffer_clcudaapi(self, name):
"""As above but with CLCudaAPI buffers"""
if name in self.inputs or name in self.outputs:
@@ -238,6 +298,12 @@ class Routine:
return [name]
return []
+ def scalar_cpp(self, name):
+ """As above, but with _cpp as a suffix: returns ['<name>_cpp'] when 'name' is one of this
+ routine's scalars and an empty list otherwise"""
+ if name in self.scalars:
+ return [name + "_cpp"]
+ return []
+
def scalar_half_to_float(self, name):
"""As above, but converts from float to half"""
if name in self.scalars:
@@ -288,6 +354,16 @@ class Routine:
return ["const " + flavour.beta_cpp + " " + name]
return []
+ def scalar_def_void(self, name, flavour):
+ """Retrieves the definition of a scalar (alpha/beta) but make it a void pointer when
+ flavour.is_complex() is true for that scalar. Returns a one-element list, or an empty list
+ when 'name' is not one of this routine's scalars"""
+ if name in self.scalars:
+ if name == "alpha":
+ data_type = "void*" if flavour.is_complex("alpha") else flavour.alpha_cpp
+ return ["const " + data_type + " " + name]
+ # Every scalar other than alpha uses the beta type
+ data_type = "void*" if flavour.is_complex("beta") else flavour.beta_cpp
+ return ["const " + data_type + " " + name]
+ return []
+
def scalar_type(self, name, flavour):
"""Retrieves the type of a scalar (alpha/beta)"""
if name in self.scalars:
@@ -304,6 +380,16 @@ class Routine:
return ["`const " + self.template.beta_cpp + " " + name + "`: Input scalar constant."]
return []
+ def scalar_create_cpp(self, flavour):
+ """Creates a C++ version of a scalar based on a void*: returns a list of C++ statements that
+ define 'alpha_cpp' and/or 'beta_cpp', one for each of those two names present in
+ self.scalars"""
+ result = []
+ for name in self.scalars:
+ # Scalars with any other name produce no statement
+ if name == "alpha":
+ result.append("const auto alpha_cpp = " + flavour.use_alpha_clblast() + ";")
+ elif name == "beta":
+ result.append("const auto beta_cpp = " + flavour.use_beta_clblast() + ";")
+ return result
+
def sizes_list(self):
"""Retrieves a list of comma-separated sizes (m, n, k)"""
if self.sizes:
@@ -316,6 +402,12 @@ class Routine:
return [", ".join(["const size_t " + s for s in self.sizes])]
return []
+ def sizes_def_netlib(self):
+ """Retrieves the definition of the sizes (m,n,k) for the CBLAS API: a one-element list with all
+ sizes declared as 'const int' and joined by commas, or an empty list when there are no sizes"""
+ if self.sizes:
+ return [", ".join(["const int " + s for s in self.sizes])]
+ return []
+
def sizes_type(self):
"""Retrieves the types of the sizes (m,n,k)"""
if self.sizes:
@@ -428,6 +520,17 @@ class Routine:
list(chain(*[self.buffer(b) for b in self.scalar_buffers_second()])) +
list(chain(*[self.scalar_use(s, flavour) for s in self.other_scalars()])))
+ def arguments_netlib(self, flavour, indent):
+ """As above, but for the Netlib CBLAS API: returns the list of argument strings, in order
+ options, sizes, scalar_buffers_first, alpha, buffers_first, beta, buffers_second,
+ scalar_buffers_second and the other scalars. Buffers are passed with a zero offset and
+ alpha/beta by their '_cpp' names"""
+ # NOTE(review): 'flavour' is not used in this body; only 'indent' is, by options_cast
+ return (self.options_cast(indent) + self.sizes_list() +
+ list(chain(*[self.buffer_zero_offset(b) for b in self.scalar_buffers_first()])) +
+ self.scalar_cpp("alpha") +
+ list(chain(*[self.buffer_zero_offset(b) for b in self.buffers_first()])) +
+ self.scalar_cpp("beta") +
+ list(chain(*[self.buffer_zero_offset(b) for b in self.buffers_second()])) +
+ list(chain(*[self.buffer_zero_offset(b) for b in self.scalar_buffers_second()])) +
+ list(chain(*[self.scalar(s) for s in self.other_scalars()])))
+
def arguments_wrapper_clblas(self, flavour):
"""As above, but for the clBLAS wrapper"""
return (self.options_list() + self.sizes_list() +
@@ -460,6 +563,19 @@ class Routine:
list(chain(*[self.buffer_def(b) for b in self.scalar_buffers_second()])) +
list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()])))
+ def arguments_def_netlib(self, flavour):
+ """As above, but for the Netlib CBLAS API: returns the list of parameter definitions, in order
+ options, sizes, alpha, buffers_first, beta, buffers_second, scalar_buffers_second and the
+ other scalars"""
+ result=(self.options_def_c() + self.sizes_def_netlib() +
+ self.scalar_def_void("alpha", flavour) +
+ list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.buffers_first()])) +
+ self.scalar_def_void("beta", flavour) +
+ list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.buffers_second()])) +
+ list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.scalar_buffers_second()])) +
+ list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()])))
+ # The scalar_buffers_first() parameters are only part of the list (at the very end) for the
+ # routines in routines_scalar_no_return()
+ if self.name in self.routines_scalar_no_return():
+ result += list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.scalar_buffers_first()]))
+ return result
+
def arguments_def_c(self, flavour):
"""As above, but for the C API"""
return (self.options_def_c() + self.sizes_def() +
@@ -546,6 +662,25 @@ class Routine:
result += ",\n" + indent + "cl_command_queue* queue, cl_event* event)"
return result
+ def routine_header_netlib(self, flavour, spaces, extra_qualifier):
+ """As above, but now for the original Netlib CBLAS API: returns the function header as
+ '<return type><extra_qualifier> cblas_<flavour name in lower-case><routine name>(<arguments>)',
+ without a trailing ';' or '{'"""
+ # The return type is "void" unless the first matching output decides otherwise: an index
+ # buffer gives "int", and a scalar_buffers_first() output of a routine that is not in
+ # routines_scalar_no_return() gives the flavour's buffer type with every "2" removed
+ return_type = "void"
+ for output in self.outputs:
+ if output in self.index_buffers():
+ return_type = "int"
+ break
+ if output in self.scalar_buffers_first() and self.name not in self.routines_scalar_no_return():
+ return_type = flavour.buffer_type.replace("2", "")
+ break
+ # Indentation of the continuation lines of the argument list
+ indent = " " * (spaces + len(return_type) + self.length())
+ routine_name = self.name
+ # Routines in routines_scalar_no_return() get a "_sub" suffix and a longer indent
+ if self.name in self.routines_scalar_no_return():
+ routine_name += "_sub"
+ indent += " "
+ result = return_type + extra_qualifier + " cblas_" + flavour.name.lower() + routine_name + "("
+ result += (",\n" + indent).join([a for a in self.arguments_def_netlib(flavour)]) + ")"
+ return result
+
def routine_header_wrapper_clblas(self, flavour, def_only, spaces):
"""As above, but now for the clBLAS wrapper"""
template = "<" + flavour.template + ">" if self.no_scalars() and not def_only else ""
diff --git a/src/clblast_netlib_c.cpp b/src/clblast_netlib_c.cpp
new file mode 100644
index 00000000..3fbabd43
--- /dev/null
+++ b/src/clblast_netlib_c.cpp
@@ -0,0 +1,4648 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the Netlib CBLAS interface to the CLBlast BLAS routines, performing buffer
+// copies automatically and running on the default OpenCL platform and device. For full control over
+// performance, it is advised to use the regular clblast.h or clblast_c.h headers instead.
+//
+// =================================================================================================
+
+#include <cstdlib>
+
+#include "clblast_netlib_c.h"
+#include "clblast.h"
+#include "utilities/utilities.hpp"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Helper function to get a default OpenCL platform and device
+clblast::Device get_device() {
+  // The platform and device indices are taken from the CLBLAST_PLATFORM and CLBLAST_DEVICE
+  // environment variables; size_t{0} is handed to ConvertArgument as its second argument
+  // (presumably the value used when a variable is not set -- confirm in utilities.hpp).
+  const auto platform_index = clblast::ConvertArgument(std::getenv("CLBLAST_PLATFORM"),
+                                                       size_t{0});
+  const auto device_index = clblast::ConvertArgument(std::getenv("CLBLAST_DEVICE"),
+                                                     size_t{0});
+  auto selected_platform = clblast::Platform(platform_index);
+  return clblast::Device(selected_platform, device_index);
+}
+
+// =================================================================================================
+// BLAS level-1 (vector-vector) routines
+// =================================================================================================
+
+// ROTG
+// Single-precision ROTG: each of the four arguments is one float that is uploaded to its own
+// device buffer, passed to clblast::Rotg and read back into the caller's variable afterwards.
+void cblas_srotg(float* sa,
+                 float* sb,
+                 float* sc,
+                 float* ss) {
+  auto gpu = get_device();
+  auto ctx = clblast::Context(gpu);
+  auto cmd_queue = clblast::Queue(ctx, gpu);
+  // All four operands hold exactly one element
+  const auto num_values = 1;
+  auto sa_dev = clblast::Buffer<float>(ctx, num_values);
+  auto sb_dev = clblast::Buffer<float>(ctx, num_values);
+  auto sc_dev = clblast::Buffer<float>(ctx, num_values);
+  auto ss_dev = clblast::Buffer<float>(ctx, num_values);
+  sa_dev.Write(cmd_queue, num_values, sa);
+  sb_dev.Write(cmd_queue, num_values, sb);
+  sc_dev.Write(cmd_queue, num_values, sc);
+  ss_dev.Write(cmd_queue, num_values, ss);
+  auto raw_queue = cmd_queue();
+  const auto status = clblast::Rotg<float>(sa_dev(), 0, sb_dev(), 0,
+                                           sc_dev(), 0, ss_dev(), 0,
+                                           &raw_queue);
+  // A failing status code is reported to the caller as an exception
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  sa_dev.Read(cmd_queue, num_values, sa);
+  sb_dev.Read(cmd_queue, num_values, sb);
+  sc_dev.Read(cmd_queue, num_values, sc);
+  ss_dev.Read(cmd_queue, num_values, ss);
+}
+// Double-precision ROTG: each of the four arguments is one double that is uploaded to its own
+// device buffer, passed to clblast::Rotg and read back into the caller's variable afterwards.
+void cblas_drotg(double* sa,
+                 double* sb,
+                 double* sc,
+                 double* ss) {
+  auto gpu = get_device();
+  auto ctx = clblast::Context(gpu);
+  auto cmd_queue = clblast::Queue(ctx, gpu);
+  // All four operands hold exactly one element
+  const auto num_values = 1;
+  auto sa_dev = clblast::Buffer<double>(ctx, num_values);
+  auto sb_dev = clblast::Buffer<double>(ctx, num_values);
+  auto sc_dev = clblast::Buffer<double>(ctx, num_values);
+  auto ss_dev = clblast::Buffer<double>(ctx, num_values);
+  sa_dev.Write(cmd_queue, num_values, sa);
+  sb_dev.Write(cmd_queue, num_values, sb);
+  sc_dev.Write(cmd_queue, num_values, sc);
+  ss_dev.Write(cmd_queue, num_values, ss);
+  auto raw_queue = cmd_queue();
+  const auto status = clblast::Rotg<double>(sa_dev(), 0, sb_dev(), 0,
+                                            sc_dev(), 0, ss_dev(), 0,
+                                            &raw_queue);
+  // A failing status code is reported to the caller as an exception
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  sa_dev.Read(cmd_queue, num_values, sa);
+  sb_dev.Read(cmd_queue, num_values, sb);
+  sc_dev.Read(cmd_queue, num_values, sc);
+  ss_dev.Read(cmd_queue, num_values, ss);
+}
+
+// ROTMG
+// Single-precision ROTMG. The scalars sd1, sd2 and sx1 are in/out values of one element each,
+// sy1 is an input passed by value, and sparam is the parameter array. All of them are staged in
+// device buffers for clblast::Rotmg; everything except sy1 is read back afterwards.
+// Throws std::runtime_error when CLBlast reports a status other than kSuccess.
+void cblas_srotmg(float* sd1,
+                  float* sd2,
+                  float* sx1,
+                  const float sy1,
+                  float* sparam) {
+  auto gpu = get_device();
+  auto ctx = clblast::Context(gpu);
+  auto cmd_queue = clblast::Queue(ctx, gpu);
+  const auto scalar_count = 1;
+  // The reference BLAS defines the parameter array as 5 elements (a flag followed by the four
+  // H-matrix entries); transferring a single element would drop the other four.
+  const auto sparam_count = 5;
+  // sy1 arrives by value, so it is placed in a one-element host array for the upload
+  const float sy1_host[1] = {sy1};
+  auto sy1_dev = clblast::Buffer<float>(ctx, scalar_count);
+  auto sd1_dev = clblast::Buffer<float>(ctx, scalar_count);
+  auto sd2_dev = clblast::Buffer<float>(ctx, scalar_count);
+  auto sx1_dev = clblast::Buffer<float>(ctx, scalar_count);
+  auto sparam_dev = clblast::Buffer<float>(ctx, sparam_count);
+  sy1_dev.Write(cmd_queue, scalar_count, sy1_host);
+  sd1_dev.Write(cmd_queue, scalar_count, sd1);
+  sd2_dev.Write(cmd_queue, scalar_count, sd2);
+  sx1_dev.Write(cmd_queue, scalar_count, sx1);
+  sparam_dev.Write(cmd_queue, sparam_count, sparam);
+  auto raw_queue = cmd_queue();
+  const auto status = clblast::Rotmg<float>(sd1_dev(), 0,
+                                            sd2_dev(), 0,
+                                            sx1_dev(), 0,
+                                            sy1_dev(), 0,
+                                            sparam_dev(), 0,
+                                            &raw_queue);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  sd1_dev.Read(cmd_queue, scalar_count, sd1);
+  sd2_dev.Read(cmd_queue, scalar_count, sd2);
+  sx1_dev.Read(cmd_queue, scalar_count, sx1);
+  sparam_dev.Read(cmd_queue, sparam_count, sparam);
+}
+// Double-precision ROTMG. The scalars sd1, sd2 and sx1 are in/out values of one element each,
+// sy1 is an input passed by value, and sparam is the parameter array. All of them are staged in
+// device buffers for clblast::Rotmg; everything except sy1 is read back afterwards.
+// Throws std::runtime_error when CLBlast reports a status other than kSuccess.
+void cblas_drotmg(double* sd1,
+                  double* sd2,
+                  double* sx1,
+                  const double sy1,
+                  double* sparam) {
+  auto gpu = get_device();
+  auto ctx = clblast::Context(gpu);
+  auto cmd_queue = clblast::Queue(ctx, gpu);
+  const auto scalar_count = 1;
+  // The reference BLAS defines the parameter array as 5 elements (a flag followed by the four
+  // H-matrix entries); transferring a single element would drop the other four.
+  const auto sparam_count = 5;
+  // sy1 arrives by value, so it is placed in a one-element host array for the upload
+  const double sy1_host[1] = {sy1};
+  auto sy1_dev = clblast::Buffer<double>(ctx, scalar_count);
+  auto sd1_dev = clblast::Buffer<double>(ctx, scalar_count);
+  auto sd2_dev = clblast::Buffer<double>(ctx, scalar_count);
+  auto sx1_dev = clblast::Buffer<double>(ctx, scalar_count);
+  auto sparam_dev = clblast::Buffer<double>(ctx, sparam_count);
+  sy1_dev.Write(cmd_queue, scalar_count, sy1_host);
+  sd1_dev.Write(cmd_queue, scalar_count, sd1);
+  sd2_dev.Write(cmd_queue, scalar_count, sd2);
+  sx1_dev.Write(cmd_queue, scalar_count, sx1);
+  sparam_dev.Write(cmd_queue, sparam_count, sparam);
+  auto raw_queue = cmd_queue();
+  const auto status = clblast::Rotmg<double>(sd1_dev(), 0,
+                                             sd2_dev(), 0,
+                                             sx1_dev(), 0,
+                                             sy1_dev(), 0,
+                                             sparam_dev(), 0,
+                                             &raw_queue);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  sd1_dev.Read(cmd_queue, scalar_count, sd1);
+  sd2_dev.Read(cmd_queue, scalar_count, sd2);
+  sx1_dev.Read(cmd_queue, scalar_count, sx1);
+  sparam_dev.Read(cmd_queue, sparam_count, sparam);
+}
+
+// ROT
+// Single-precision ROT: x and y are uploaded, processed by clblast::Rot with the given cos/sin
+// values, and both copied back into the caller's arrays.
+void cblas_srot(const int n,
+                float* x, const int x_inc,
+                float* y, const int y_inc,
+                const float cos,
+                const float sin) {
+  auto gpu = get_device();
+  auto ctx = clblast::Context(gpu);
+  auto cmd_queue = clblast::Queue(ctx, gpu);
+  // Element counts of the device copies: n times the respective increment
+  const auto x_count = n * x_inc;
+  const auto y_count = n * y_inc;
+  auto x_dev = clblast::Buffer<float>(ctx, x_count);
+  auto y_dev = clblast::Buffer<float>(ctx, y_count);
+  x_dev.Write(cmd_queue, x_count, x);
+  y_dev.Write(cmd_queue, y_count, y);
+  auto raw_queue = cmd_queue();
+  const auto status = clblast::Rot(n,
+                                   x_dev(), 0, x_inc,
+                                   y_dev(), 0, y_inc,
+                                   cos, sin,
+                                   &raw_queue);
+  // A failing status code is reported to the caller as an exception
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  x_dev.Read(cmd_queue, x_count, x);
+  y_dev.Read(cmd_queue, y_count, y);
+}
+// Double-precision ROT: x and y are uploaded, processed by clblast::Rot with the given cos/sin
+// values, and both copied back into the caller's arrays.
+void cblas_drot(const int n,
+                double* x, const int x_inc,
+                double* y, const int y_inc,
+                const double cos,
+                const double sin) {
+  auto gpu = get_device();
+  auto ctx = clblast::Context(gpu);
+  auto cmd_queue = clblast::Queue(ctx, gpu);
+  // Element counts of the device copies: n times the respective increment
+  const auto x_count = n * x_inc;
+  const auto y_count = n * y_inc;
+  auto x_dev = clblast::Buffer<double>(ctx, x_count);
+  auto y_dev = clblast::Buffer<double>(ctx, y_count);
+  x_dev.Write(cmd_queue, x_count, x);
+  y_dev.Write(cmd_queue, y_count, y);
+  auto raw_queue = cmd_queue();
+  const auto status = clblast::Rot(n,
+                                   x_dev(), 0, x_inc,
+                                   y_dev(), 0, y_inc,
+                                   cos, sin,
+                                   &raw_queue);
+  // A failing status code is reported to the caller as an exception
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  x_dev.Read(cmd_queue, x_count, x);
+  y_dev.Read(cmd_queue, y_count, y);
+}
+
+// ROTM
+// Single-precision ROTM: x, y and the parameter array are uploaded, processed by clblast::Rotm
+// and all three copied back into the caller's arrays.
+// Throws std::runtime_error when CLBlast reports a status other than kSuccess.
+void cblas_srotm(const int n,
+                 float* x, const int x_inc,
+                 float* y, const int y_inc,
+                 float* sparam) {
+  auto gpu = get_device();
+  auto ctx = clblast::Context(gpu);
+  auto cmd_queue = clblast::Queue(ctx, gpu);
+  // Element counts of the device copies: n times the respective increment
+  const auto x_count = n * x_inc;
+  const auto y_count = n * y_inc;
+  // The reference BLAS defines the parameter array as 5 elements (a flag followed by the four
+  // H-matrix entries); transferring a single element would drop the other four.
+  const auto sparam_count = 5;
+  auto x_dev = clblast::Buffer<float>(ctx, x_count);
+  auto y_dev = clblast::Buffer<float>(ctx, y_count);
+  auto sparam_dev = clblast::Buffer<float>(ctx, sparam_count);
+  x_dev.Write(cmd_queue, x_count, x);
+  y_dev.Write(cmd_queue, y_count, y);
+  sparam_dev.Write(cmd_queue, sparam_count, sparam);
+  auto raw_queue = cmd_queue();
+  const auto status = clblast::Rotm<float>(n,
+                                           x_dev(), 0, x_inc,
+                                           y_dev(), 0, y_inc,
+                                           sparam_dev(), 0,
+                                           &raw_queue);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  x_dev.Read(cmd_queue, x_count, x);
+  y_dev.Read(cmd_queue, y_count, y);
+  sparam_dev.Read(cmd_queue, sparam_count, sparam);
+}
+// Double-precision ROTM: x, y and the parameter array are uploaded, processed by clblast::Rotm
+// and all three copied back into the caller's arrays.
+// Throws std::runtime_error when CLBlast reports a status other than kSuccess.
+void cblas_drotm(const int n,
+                 double* x, const int x_inc,
+                 double* y, const int y_inc,
+                 double* sparam) {
+  auto gpu = get_device();
+  auto ctx = clblast::Context(gpu);
+  auto cmd_queue = clblast::Queue(ctx, gpu);
+  // Element counts of the device copies: n times the respective increment
+  const auto x_count = n * x_inc;
+  const auto y_count = n * y_inc;
+  // The reference BLAS defines the parameter array as 5 elements (a flag followed by the four
+  // H-matrix entries); transferring a single element would drop the other four.
+  const auto sparam_count = 5;
+  auto x_dev = clblast::Buffer<double>(ctx, x_count);
+  auto y_dev = clblast::Buffer<double>(ctx, y_count);
+  auto sparam_dev = clblast::Buffer<double>(ctx, sparam_count);
+  x_dev.Write(cmd_queue, x_count, x);
+  y_dev.Write(cmd_queue, y_count, y);
+  sparam_dev.Write(cmd_queue, sparam_count, sparam);
+  auto raw_queue = cmd_queue();
+  const auto status = clblast::Rotm<double>(n,
+                                            x_dev(), 0, x_inc,
+                                            y_dev(), 0, y_inc,
+                                            sparam_dev(), 0,
+                                            &raw_queue);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  x_dev.Read(cmd_queue, x_count, x);
+  y_dev.Read(cmd_queue, y_count, y);
+  sparam_dev.Read(cmd_queue, sparam_count, sparam);
+}
+
+// SWAP
+// Single-precision SWAP: both vectors are uploaded, handed to clblast::Swap and copied back.
+void cblas_sswap(const int n,
+                 float* x, const int x_inc,
+                 float* y, const int y_inc) {
+  auto gpu = get_device();
+  auto ctx = clblast::Context(gpu);
+  auto cmd_queue = clblast::Queue(ctx, gpu);
+  // Element counts of the device copies: n times the respective increment
+  const auto x_count = n * x_inc;
+  const auto y_count = n * y_inc;
+  auto x_dev = clblast::Buffer<float>(ctx, x_count);
+  auto y_dev = clblast::Buffer<float>(ctx, y_count);
+  x_dev.Write(cmd_queue, x_count, x);
+  y_dev.Write(cmd_queue, y_count, y);
+  auto raw_queue = cmd_queue();
+  const auto status = clblast::Swap<float>(n,
+                                           x_dev(), 0, x_inc,
+                                           y_dev(), 0, y_inc,
+                                           &raw_queue);
+  // A failing status code is reported to the caller as an exception
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  x_dev.Read(cmd_queue, x_count, x);
+  y_dev.Read(cmd_queue, y_count, y);
+}
+// Double-precision SWAP: both vectors are uploaded, handed to clblast::Swap and copied back.
+void cblas_dswap(const int n,
+                 double* x, const int x_inc,
+                 double* y, const int y_inc) {
+  auto gpu = get_device();
+  auto ctx = clblast::Context(gpu);
+  auto cmd_queue = clblast::Queue(ctx, gpu);
+  // Element counts of the device copies: n times the respective increment
+  const auto x_count = n * x_inc;
+  const auto y_count = n * y_inc;
+  auto x_dev = clblast::Buffer<double>(ctx, x_count);
+  auto y_dev = clblast::Buffer<double>(ctx, y_count);
+  x_dev.Write(cmd_queue, x_count, x);
+  y_dev.Write(cmd_queue, y_count, y);
+  auto raw_queue = cmd_queue();
+  const auto status = clblast::Swap<double>(n,
+                                            x_dev(), 0, x_inc,
+                                            y_dev(), 0, y_inc,
+                                            &raw_queue);
+  // A failing status code is reported to the caller as an exception
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  x_dev.Read(cmd_queue, x_count, x);
+  y_dev.Read(cmd_queue, y_count, y);
+}
+// Single-precision complex SWAP: the untyped pointers are treated as float2 arrays, uploaded,
+// handed to clblast::Swap and copied back.
+void cblas_cswap(const int n,
+                 void* x, const int x_inc,
+                 void* y, const int y_inc) {
+  auto gpu = get_device();
+  auto ctx = clblast::Context(gpu);
+  auto cmd_queue = clblast::Queue(ctx, gpu);
+  // Typed views on the caller's arrays
+  auto* x_host = static_cast<float2*>(x);
+  auto* y_host = static_cast<float2*>(y);
+  const auto x_count = n * x_inc;
+  const auto y_count = n * y_inc;
+  auto x_dev = clblast::Buffer<float2>(ctx, x_count);
+  auto y_dev = clblast::Buffer<float2>(ctx, y_count);
+  x_dev.Write(cmd_queue, x_count, x_host);
+  y_dev.Write(cmd_queue, y_count, y_host);
+  auto raw_queue = cmd_queue();
+  const auto status = clblast::Swap<float2>(n,
+                                            x_dev(), 0, x_inc,
+                                            y_dev(), 0, y_inc,
+                                            &raw_queue);
+  // A failing status code is reported to the caller as an exception
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  x_dev.Read(cmd_queue, x_count, x_host);
+  y_dev.Read(cmd_queue, y_count, y_host);
+}
+// Double-precision complex SWAP: the untyped pointers are treated as double2 arrays, uploaded,
+// handed to clblast::Swap and copied back.
+void cblas_zswap(const int n,
+                 void* x, const int x_inc,
+                 void* y, const int y_inc) {
+  auto gpu = get_device();
+  auto ctx = clblast::Context(gpu);
+  auto cmd_queue = clblast::Queue(ctx, gpu);
+  // Typed views on the caller's arrays
+  auto* x_host = static_cast<double2*>(x);
+  auto* y_host = static_cast<double2*>(y);
+  const auto x_count = n * x_inc;
+  const auto y_count = n * y_inc;
+  auto x_dev = clblast::Buffer<double2>(ctx, x_count);
+  auto y_dev = clblast::Buffer<double2>(ctx, y_count);
+  x_dev.Write(cmd_queue, x_count, x_host);
+  y_dev.Write(cmd_queue, y_count, y_host);
+  auto raw_queue = cmd_queue();
+  const auto status = clblast::Swap<double2>(n,
+                                             x_dev(), 0, x_inc,
+                                             y_dev(), 0, y_inc,
+                                             &raw_queue);
+  // A failing status code is reported to the caller as an exception
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  x_dev.Read(cmd_queue, x_count, x_host);
+  y_dev.Read(cmd_queue, y_count, y_host);
+}
+
+// SCAL
+// Single-precision SCAL: x is uploaded, passed to clblast::Scal together with alpha, and the
+// result is copied back into the caller's array.
+void cblas_sscal(const int n,
+                 const float alpha,
+                 float* x, const int x_inc) {
+  auto gpu = get_device();
+  auto ctx = clblast::Context(gpu);
+  auto cmd_queue = clblast::Queue(ctx, gpu);
+  const auto alpha_value = alpha;
+  // Element count of the device copy: n times the increment
+  const auto x_count = n * x_inc;
+  auto x_dev = clblast::Buffer<float>(ctx, x_count);
+  x_dev.Write(cmd_queue, x_count, x);
+  auto raw_queue = cmd_queue();
+  const auto status = clblast::Scal(n, alpha_value,
+                                    x_dev(), 0, x_inc,
+                                    &raw_queue);
+  // A failing status code is reported to the caller as an exception
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  x_dev.Read(cmd_queue, x_count, x);
+}
+// Double-precision SCAL: x is uploaded, passed to clblast::Scal together with alpha, and the
+// result is copied back into the caller's array.
+void cblas_dscal(const int n,
+                 const double alpha,
+                 double* x, const int x_inc) {
+  auto gpu = get_device();
+  auto ctx = clblast::Context(gpu);
+  auto cmd_queue = clblast::Queue(ctx, gpu);
+  const auto alpha_value = alpha;
+  // Element count of the device copy: n times the increment
+  const auto x_count = n * x_inc;
+  auto x_dev = clblast::Buffer<double>(ctx, x_count);
+  x_dev.Write(cmd_queue, x_count, x);
+  auto raw_queue = cmd_queue();
+  const auto status = clblast::Scal(n, alpha_value,
+                                    x_dev(), 0, x_inc,
+                                    &raw_queue);
+  // A failing status code is reported to the caller as an exception
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  x_dev.Read(cmd_queue, x_count, x);
+}
+// Single-precision complex SCAL: alpha is read as two consecutive floats and x is treated as a
+// float2 array that is uploaded, passed to clblast::Scal and copied back.
+void cblas_cscal(const int n,
+                 const void* alpha,
+                 void* x, const int x_inc) {
+  auto gpu = get_device();
+  auto ctx = clblast::Context(gpu);
+  auto cmd_queue = clblast::Queue(ctx, gpu);
+  // The two floats behind alpha become the two constructor arguments of the float2 scalar
+  const auto* alpha_parts = static_cast<const float*>(alpha);
+  const auto alpha_value = float2{alpha_parts[0], alpha_parts[1]};
+  auto* x_host = static_cast<float2*>(x);
+  const auto x_count = n * x_inc;
+  auto x_dev = clblast::Buffer<float2>(ctx, x_count);
+  x_dev.Write(cmd_queue, x_count, x_host);
+  auto raw_queue = cmd_queue();
+  const auto status = clblast::Scal(n, alpha_value,
+                                    x_dev(), 0, x_inc,
+                                    &raw_queue);
+  // A failing status code is reported to the caller as an exception
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  x_dev.Read(cmd_queue, x_count, x_host);
+}
+// Double-precision complex SCAL: alpha is read as two consecutive doubles and x is treated as
+// a double2 array that is uploaded, passed to clblast::Scal and copied back.
+void cblas_zscal(const int n,
+                 const void* alpha,
+                 void* x, const int x_inc) {
+  auto gpu = get_device();
+  auto ctx = clblast::Context(gpu);
+  auto cmd_queue = clblast::Queue(ctx, gpu);
+  // The two doubles behind alpha become the two constructor arguments of the double2 scalar
+  const auto* alpha_parts = static_cast<const double*>(alpha);
+  const auto alpha_value = double2{alpha_parts[0], alpha_parts[1]};
+  auto* x_host = static_cast<double2*>(x);
+  const auto x_count = n * x_inc;
+  auto x_dev = clblast::Buffer<double2>(ctx, x_count);
+  x_dev.Write(cmd_queue, x_count, x_host);
+  auto raw_queue = cmd_queue();
+  const auto status = clblast::Scal(n, alpha_value,
+                                    x_dev(), 0, x_inc,
+                                    &raw_queue);
+  // A failing status code is reported to the caller as an exception
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  x_dev.Read(cmd_queue, x_count, x_host);
+}
+
+// COPY
+// Single-precision COPY: x and y are uploaded, clblast::Copy runs on the device copies and only
+// y is read back.
+void cblas_scopy(const int n,
+                 const float* x, const int x_inc,
+                 float* y, const int y_inc) {
+  auto gpu = get_device();
+  auto ctx = clblast::Context(gpu);
+  auto cmd_queue = clblast::Queue(ctx, gpu);
+  // Element counts of the device copies: n times the respective increment
+  const auto x_count = n * x_inc;
+  const auto y_count = n * y_inc;
+  auto x_dev = clblast::Buffer<float>(ctx, x_count);
+  auto y_dev = clblast::Buffer<float>(ctx, y_count);
+  x_dev.Write(cmd_queue, x_count, x);
+  // y is uploaded too (presumably so elements skipped by y_inc keep their host values)
+  y_dev.Write(cmd_queue, y_count, y);
+  auto raw_queue = cmd_queue();
+  const auto status = clblast::Copy<float>(n,
+                                           x_dev(), 0, x_inc,
+                                           y_dev(), 0, y_inc,
+                                           &raw_queue);
+  // A failing status code is reported to the caller as an exception
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  y_dev.Read(cmd_queue, y_count, y);
+}
+// Double-precision COPY: x and y are uploaded, clblast::Copy runs on the device copies and only
+// y is read back.
+void cblas_dcopy(const int n,
+                 const double* x, const int x_inc,
+                 double* y, const int y_inc) {
+  auto gpu = get_device();
+  auto ctx = clblast::Context(gpu);
+  auto cmd_queue = clblast::Queue(ctx, gpu);
+  // Element counts of the device copies: n times the respective increment
+  const auto x_count = n * x_inc;
+  const auto y_count = n * y_inc;
+  auto x_dev = clblast::Buffer<double>(ctx, x_count);
+  auto y_dev = clblast::Buffer<double>(ctx, y_count);
+  x_dev.Write(cmd_queue, x_count, x);
+  // y is uploaded too (presumably so elements skipped by y_inc keep their host values)
+  y_dev.Write(cmd_queue, y_count, y);
+  auto raw_queue = cmd_queue();
+  const auto status = clblast::Copy<double>(n,
+                                            x_dev(), 0, x_inc,
+                                            y_dev(), 0, y_inc,
+                                            &raw_queue);
+  // A failing status code is reported to the caller as an exception
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  y_dev.Read(cmd_queue, y_count, y);
+}
+// Single-precision complex COPY: the untyped pointers are treated as float2 arrays; x and y are
+// uploaded, clblast::Copy runs on the device copies and only y is read back.
+void cblas_ccopy(const int n,
+                 const void* x, const int x_inc,
+                 void* y, const int y_inc) {
+  auto gpu = get_device();
+  auto ctx = clblast::Context(gpu);
+  auto cmd_queue = clblast::Queue(ctx, gpu);
+  // Typed views on the caller's arrays
+  const auto* x_host = static_cast<const float2*>(x);
+  auto* y_host = static_cast<float2*>(y);
+  const auto x_count = n * x_inc;
+  const auto y_count = n * y_inc;
+  auto x_dev = clblast::Buffer<float2>(ctx, x_count);
+  auto y_dev = clblast::Buffer<float2>(ctx, y_count);
+  x_dev.Write(cmd_queue, x_count, x_host);
+  y_dev.Write(cmd_queue, y_count, y_host);
+  auto raw_queue = cmd_queue();
+  const auto status = clblast::Copy<float2>(n,
+                                            x_dev(), 0, x_inc,
+                                            y_dev(), 0, y_inc,
+                                            &raw_queue);
+  // A failing status code is reported to the caller as an exception
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  y_dev.Read(cmd_queue, y_count, y_host);
+}
+// Double-precision complex COPY: the untyped pointers are treated as double2 arrays; x and y
+// are uploaded, clblast::Copy runs on the device copies and only y is read back.
+void cblas_zcopy(const int n,
+                 const void* x, const int x_inc,
+                 void* y, const int y_inc) {
+  auto gpu = get_device();
+  auto ctx = clblast::Context(gpu);
+  auto cmd_queue = clblast::Queue(ctx, gpu);
+  // Typed views on the caller's arrays
+  const auto* x_host = static_cast<const double2*>(x);
+  auto* y_host = static_cast<double2*>(y);
+  const auto x_count = n * x_inc;
+  const auto y_count = n * y_inc;
+  auto x_dev = clblast::Buffer<double2>(ctx, x_count);
+  auto y_dev = clblast::Buffer<double2>(ctx, y_count);
+  x_dev.Write(cmd_queue, x_count, x_host);
+  y_dev.Write(cmd_queue, y_count, y_host);
+  auto raw_queue = cmd_queue();
+  const auto status = clblast::Copy<double2>(n,
+                                             x_dev(), 0, x_inc,
+                                             y_dev(), 0, y_inc,
+                                             &raw_queue);
+  // A failing status code is reported to the caller as an exception
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  y_dev.Read(cmd_queue, y_count, y_host);
+}
+
+// AXPY
+// Single-precision AXPY: x and y are uploaded, clblast::Axpy runs with the given alpha and only
+// y is read back.
+void cblas_saxpy(const int n,
+                 const float alpha,
+                 const float* x, const int x_inc,
+                 float* y, const int y_inc) {
+  auto gpu = get_device();
+  auto ctx = clblast::Context(gpu);
+  auto cmd_queue = clblast::Queue(ctx, gpu);
+  const auto alpha_value = alpha;
+  // Element counts of the device copies: n times the respective increment
+  const auto x_count = n * x_inc;
+  const auto y_count = n * y_inc;
+  auto x_dev = clblast::Buffer<float>(ctx, x_count);
+  auto y_dev = clblast::Buffer<float>(ctx, y_count);
+  x_dev.Write(cmd_queue, x_count, x);
+  y_dev.Write(cmd_queue, y_count, y);
+  auto raw_queue = cmd_queue();
+  const auto status = clblast::Axpy(n, alpha_value,
+                                    x_dev(), 0, x_inc,
+                                    y_dev(), 0, y_inc,
+                                    &raw_queue);
+  // A failing status code is reported to the caller as an exception
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  y_dev.Read(cmd_queue, y_count, y);
+}
+// Double-precision AXPY: x and y are uploaded, clblast::Axpy runs with the given alpha and only
+// y is read back.
+void cblas_daxpy(const int n,
+                 const double alpha,
+                 const double* x, const int x_inc,
+                 double* y, const int y_inc) {
+  auto gpu = get_device();
+  auto ctx = clblast::Context(gpu);
+  auto cmd_queue = clblast::Queue(ctx, gpu);
+  const auto alpha_value = alpha;
+  // Element counts of the device copies: n times the respective increment
+  const auto x_count = n * x_inc;
+  const auto y_count = n * y_inc;
+  auto x_dev = clblast::Buffer<double>(ctx, x_count);
+  auto y_dev = clblast::Buffer<double>(ctx, y_count);
+  x_dev.Write(cmd_queue, x_count, x);
+  y_dev.Write(cmd_queue, y_count, y);
+  auto raw_queue = cmd_queue();
+  const auto status = clblast::Axpy(n, alpha_value,
+                                    x_dev(), 0, x_inc,
+                                    y_dev(), 0, y_inc,
+                                    &raw_queue);
+  // A failing status code is reported to the caller as an exception
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  y_dev.Read(cmd_queue, y_count, y);
+}
+// Single-precision complex AXPY: alpha is read as two consecutive floats, x and y are treated
+// as float2 arrays; both are uploaded, clblast::Axpy runs and only y is read back.
+void cblas_caxpy(const int n,
+                 const void* alpha,
+                 const void* x, const int x_inc,
+                 void* y, const int y_inc) {
+  auto gpu = get_device();
+  auto ctx = clblast::Context(gpu);
+  auto cmd_queue = clblast::Queue(ctx, gpu);
+  // The two floats behind alpha become the two constructor arguments of the float2 scalar
+  const auto* alpha_parts = static_cast<const float*>(alpha);
+  const auto alpha_value = float2{alpha_parts[0], alpha_parts[1]};
+  const auto* x_host = static_cast<const float2*>(x);
+  auto* y_host = static_cast<float2*>(y);
+  const auto x_count = n * x_inc;
+  const auto y_count = n * y_inc;
+  auto x_dev = clblast::Buffer<float2>(ctx, x_count);
+  auto y_dev = clblast::Buffer<float2>(ctx, y_count);
+  x_dev.Write(cmd_queue, x_count, x_host);
+  y_dev.Write(cmd_queue, y_count, y_host);
+  auto raw_queue = cmd_queue();
+  const auto status = clblast::Axpy(n, alpha_value,
+                                    x_dev(), 0, x_inc,
+                                    y_dev(), 0, y_inc,
+                                    &raw_queue);
+  // A failing status code is reported to the caller as an exception
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  y_dev.Read(cmd_queue, y_count, y_host);
+}
+// Double-precision complex AXPY: alpha is read as two consecutive doubles, x and y are treated
+// as double2 arrays; both are uploaded, clblast::Axpy runs and only y is read back.
+void cblas_zaxpy(const int n,
+                 const void* alpha,
+                 const void* x, const int x_inc,
+                 void* y, const int y_inc) {
+  auto gpu = get_device();
+  auto ctx = clblast::Context(gpu);
+  auto cmd_queue = clblast::Queue(ctx, gpu);
+  // The two doubles behind alpha become the two constructor arguments of the double2 scalar
+  const auto* alpha_parts = static_cast<const double*>(alpha);
+  const auto alpha_value = double2{alpha_parts[0], alpha_parts[1]};
+  const auto* x_host = static_cast<const double2*>(x);
+  auto* y_host = static_cast<double2*>(y);
+  const auto x_count = n * x_inc;
+  const auto y_count = n * y_inc;
+  auto x_dev = clblast::Buffer<double2>(ctx, x_count);
+  auto y_dev = clblast::Buffer<double2>(ctx, y_count);
+  x_dev.Write(cmd_queue, x_count, x_host);
+  y_dev.Write(cmd_queue, y_count, y_host);
+  auto raw_queue = cmd_queue();
+  const auto status = clblast::Axpy(n, alpha_value,
+                                    x_dev(), 0, x_inc,
+                                    y_dev(), 0, y_inc,
+                                    &raw_queue);
+  // A failing status code is reported to the caller as an exception
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  y_dev.Read(cmd_queue, y_count, y_host);
+}
+
+// DOT
+// Single-precision DOT: x and y are uploaded, clblast::Dot writes its result into a one-element
+// device buffer, and that element is read back and returned.
+float cblas_sdot(const int n,
+                 const float* x, const int x_inc,
+                 const float* y, const int y_inc) {
+  auto gpu = get_device();
+  auto ctx = clblast::Context(gpu);
+  auto cmd_queue = clblast::Queue(ctx, gpu);
+  // Element counts of the device copies: n times the respective increment
+  const auto x_count = n * x_inc;
+  const auto y_count = n * y_inc;
+  const auto result_count = 1;
+  auto x_dev = clblast::Buffer<float>(ctx, x_count);
+  auto y_dev = clblast::Buffer<float>(ctx, y_count);
+  auto result_dev = clblast::Buffer<float>(ctx, result_count);
+  x_dev.Write(cmd_queue, x_count, x);
+  y_dev.Write(cmd_queue, y_count, y);
+  auto raw_queue = cmd_queue();
+  const auto status = clblast::Dot<float>(n,
+                                          result_dev(), 0,
+                                          x_dev(), 0, x_inc,
+                                          y_dev(), 0, y_inc,
+                                          &raw_queue);
+  // A failing status code is reported to the caller as an exception
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  float result;
+  result_dev.Read(cmd_queue, result_count, &result);
+  return result;
+}
+// Double-precision DOT: x and y are uploaded, clblast::Dot writes its result into a one-element
+// device buffer, and that element is read back and returned.
+double cblas_ddot(const int n,
+                  const double* x, const int x_inc,
+                  const double* y, const int y_inc) {
+  auto gpu = get_device();
+  auto ctx = clblast::Context(gpu);
+  auto cmd_queue = clblast::Queue(ctx, gpu);
+  // Element counts of the device copies: n times the respective increment
+  const auto x_count = n * x_inc;
+  const auto y_count = n * y_inc;
+  const auto result_count = 1;
+  auto x_dev = clblast::Buffer<double>(ctx, x_count);
+  auto y_dev = clblast::Buffer<double>(ctx, y_count);
+  auto result_dev = clblast::Buffer<double>(ctx, result_count);
+  x_dev.Write(cmd_queue, x_count, x);
+  y_dev.Write(cmd_queue, y_count, y);
+  auto raw_queue = cmd_queue();
+  const auto status = clblast::Dot<double>(n,
+                                           result_dev(), 0,
+                                           x_dev(), 0, x_inc,
+                                           y_dev(), 0, y_inc,
+                                           &raw_queue);
+  // A failing status code is reported to the caller as an exception
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  double result;
+  result_dev.Read(cmd_queue, result_count, &result);
+  return result;
+}
+
+// DOTU
+// Single-precision complex DOTU: x and y are treated as float2 arrays and uploaded;
+// clblast::Dotu writes into a one-element device buffer that is read back into 'dot'.
+void cblas_cdotu_sub(const int n,
+                     const void* x, const int x_inc,
+                     const void* y, const int y_inc,
+                     void* dot) {
+  auto gpu = get_device();
+  auto ctx = clblast::Context(gpu);
+  auto cmd_queue = clblast::Queue(ctx, gpu);
+  // Element counts of the device copies: n times the respective increment
+  const auto x_count = n * x_inc;
+  const auto y_count = n * y_inc;
+  const auto result_count = 1;
+  auto x_dev = clblast::Buffer<float2>(ctx, x_count);
+  auto y_dev = clblast::Buffer<float2>(ctx, y_count);
+  auto result_dev = clblast::Buffer<float2>(ctx, result_count);
+  x_dev.Write(cmd_queue, x_count, static_cast<const float2*>(x));
+  y_dev.Write(cmd_queue, y_count, static_cast<const float2*>(y));
+  auto raw_queue = cmd_queue();
+  const auto status = clblast::Dotu<float2>(n,
+                                            result_dev(), 0,
+                                            x_dev(), 0, x_inc,
+                                            y_dev(), 0, y_inc,
+                                            &raw_queue);
+  // A failing status code is reported to the caller as an exception
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  result_dev.Read(cmd_queue, result_count, static_cast<float2*>(dot));
+}
+// Double-precision complex DOTU: x and y are treated as double2 arrays and uploaded;
+// clblast::Dotu writes into a one-element device buffer that is read back into 'dot'.
+void cblas_zdotu_sub(const int n,
+                     const void* x, const int x_inc,
+                     const void* y, const int y_inc,
+                     void* dot) {
+  auto gpu = get_device();
+  auto ctx = clblast::Context(gpu);
+  auto cmd_queue = clblast::Queue(ctx, gpu);
+  // Element counts of the device copies: n times the respective increment
+  const auto x_count = n * x_inc;
+  const auto y_count = n * y_inc;
+  const auto result_count = 1;
+  auto x_dev = clblast::Buffer<double2>(ctx, x_count);
+  auto y_dev = clblast::Buffer<double2>(ctx, y_count);
+  auto result_dev = clblast::Buffer<double2>(ctx, result_count);
+  x_dev.Write(cmd_queue, x_count, static_cast<const double2*>(x));
+  y_dev.Write(cmd_queue, y_count, static_cast<const double2*>(y));
+  auto raw_queue = cmd_queue();
+  const auto status = clblast::Dotu<double2>(n,
+                                             result_dev(), 0,
+                                             x_dev(), 0, x_inc,
+                                             y_dev(), 0, y_inc,
+                                             &raw_queue);
+  // A failing status code is reported to the caller as an exception
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  result_dev.Read(cmd_queue, result_count, static_cast<double2*>(dot));
+}
+
+// DOTC
+// Single-precision complex DOTC: x and y are treated as float2 arrays and uploaded;
+// clblast::Dotc writes into a one-element device buffer that is read back into 'dot'.
+void cblas_cdotc_sub(const int n,
+                     const void* x, const int x_inc,
+                     const void* y, const int y_inc,
+                     void* dot) {
+  auto gpu = get_device();
+  auto ctx = clblast::Context(gpu);
+  auto cmd_queue = clblast::Queue(ctx, gpu);
+  // Element counts of the device copies: n times the respective increment
+  const auto x_count = n * x_inc;
+  const auto y_count = n * y_inc;
+  const auto result_count = 1;
+  auto x_dev = clblast::Buffer<float2>(ctx, x_count);
+  auto y_dev = clblast::Buffer<float2>(ctx, y_count);
+  auto result_dev = clblast::Buffer<float2>(ctx, result_count);
+  x_dev.Write(cmd_queue, x_count, static_cast<const float2*>(x));
+  y_dev.Write(cmd_queue, y_count, static_cast<const float2*>(y));
+  auto raw_queue = cmd_queue();
+  const auto status = clblast::Dotc<float2>(n,
+                                            result_dev(), 0,
+                                            x_dev(), 0, x_inc,
+                                            y_dev(), 0, y_inc,
+                                            &raw_queue);
+  // A failing status code is reported to the caller as an exception
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  result_dev.Read(cmd_queue, result_count, static_cast<float2*>(dot));
+}
+// Double-precision complex DOTC: x and y are treated as double2 arrays and uploaded;
+// clblast::Dotc writes into a one-element device buffer that is read back into 'dot'.
+void cblas_zdotc_sub(const int n,
+                     const void* x, const int x_inc,
+                     const void* y, const int y_inc,
+                     void* dot) {
+  auto gpu = get_device();
+  auto ctx = clblast::Context(gpu);
+  auto cmd_queue = clblast::Queue(ctx, gpu);
+  // Element counts of the device copies: n times the respective increment
+  const auto x_count = n * x_inc;
+  const auto y_count = n * y_inc;
+  const auto result_count = 1;
+  auto x_dev = clblast::Buffer<double2>(ctx, x_count);
+  auto y_dev = clblast::Buffer<double2>(ctx, y_count);
+  auto result_dev = clblast::Buffer<double2>(ctx, result_count);
+  x_dev.Write(cmd_queue, x_count, static_cast<const double2*>(x));
+  y_dev.Write(cmd_queue, y_count, static_cast<const double2*>(y));
+  auto raw_queue = cmd_queue();
+  const auto status = clblast::Dotc<double2>(n,
+                                             result_dev(), 0,
+                                             x_dev(), 0, x_inc,
+                                             y_dev(), 0, y_inc,
+                                             &raw_queue);
+  // A failing status code is reported to the caller as an exception
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  result_dev.Read(cmd_queue, result_count, static_cast<double2*>(dot));
+}
+
+// NRM2
+// Netlib-style wrapper around clblast::Nrm2<float>: copies x into a temporary device buffer,
+// runs the routine and returns the single result element read back from the device.
+// Throws std::runtime_error if CLBlast reports a non-success status.
+float cblas_snrm2(const int n,
+                  const float* x, const int x_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  // A strided vector of n elements spans (n-1)*inc+1 entries; copying n*inc entries would read
+  // host memory past the end of the caller's array whenever inc > 1.
+  const auto x_size = (n > 0) ? (n - 1) * x_inc + 1 : 0;
+  const auto nrm2_size = 1;
+  auto x_buffer = clblast::Buffer<float>(context, x_size);
+  auto nrm2_buffer = clblast::Buffer<float>(context, nrm2_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+  auto queue_cl = queue();
+  auto s = clblast::Nrm2<float>(n,
+                                nrm2_buffer(), 0,
+                                x_buffer(), 0, x_inc,
+                                &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+  float nrm2[nrm2_size];
+  nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast<float*>(nrm2));
+  return nrm2[0];
+}
+// Netlib-style wrapper around clblast::Nrm2<double>: copies x into a temporary device buffer,
+// runs the routine and returns the single result element read back from the device.
+// Throws std::runtime_error if CLBlast reports a non-success status.
+double cblas_dnrm2(const int n,
+                   const double* x, const int x_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  // A strided vector of n elements spans (n-1)*inc+1 entries; copying n*inc entries would read
+  // host memory past the end of the caller's array whenever inc > 1.
+  const auto x_size = (n > 0) ? (n - 1) * x_inc + 1 : 0;
+  const auto nrm2_size = 1;
+  auto x_buffer = clblast::Buffer<double>(context, x_size);
+  auto nrm2_buffer = clblast::Buffer<double>(context, nrm2_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+  auto queue_cl = queue();
+  auto s = clblast::Nrm2<double>(n,
+                                 nrm2_buffer(), 0,
+                                 x_buffer(), 0, x_inc,
+                                 &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+  double nrm2[nrm2_size];
+  nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast<double*>(nrm2));
+  return nrm2[0];
+}
+// Netlib-style wrapper around clblast::Nrm2<float2>: copies x into a temporary device buffer,
+// runs the routine and returns the real part of the single result element read back.
+// Throws std::runtime_error if CLBlast reports a non-success status.
+float cblas_scnrm2(const int n,
+                   const void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  // A strided vector of n elements spans (n-1)*inc+1 entries; copying n*inc entries would read
+  // host memory past the end of the caller's array whenever inc > 1.
+  const auto x_size = (n > 0) ? (n - 1) * x_inc + 1 : 0;
+  const auto nrm2_size = 1;
+  auto x_buffer = clblast::Buffer<float2>(context, x_size);
+  auto nrm2_buffer = clblast::Buffer<float2>(context, nrm2_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+  auto queue_cl = queue();
+  auto s = clblast::Nrm2<float2>(n,
+                                 nrm2_buffer(), 0,
+                                 x_buffer(), 0, x_inc,
+                                 &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+  float2 nrm2[nrm2_size];
+  nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast<float2*>(nrm2));
+  return nrm2[0].real();
+}
+// Netlib-style wrapper around clblast::Nrm2<double2>: copies x into a temporary device buffer,
+// runs the routine and returns the real part of the single result element read back.
+// Throws std::runtime_error if CLBlast reports a non-success status.
+double cblas_dznrm2(const int n,
+                    const void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  // A strided vector of n elements spans (n-1)*inc+1 entries; copying n*inc entries would read
+  // host memory past the end of the caller's array whenever inc > 1.
+  const auto x_size = (n > 0) ? (n - 1) * x_inc + 1 : 0;
+  const auto nrm2_size = 1;
+  auto x_buffer = clblast::Buffer<double2>(context, x_size);
+  auto nrm2_buffer = clblast::Buffer<double2>(context, nrm2_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+  auto queue_cl = queue();
+  auto s = clblast::Nrm2<double2>(n,
+                                  nrm2_buffer(), 0,
+                                  x_buffer(), 0, x_inc,
+                                  &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+  double2 nrm2[nrm2_size];
+  nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast<double2*>(nrm2));
+  return nrm2[0].real();
+}
+
+// ASUM
+// Netlib-style wrapper around clblast::Asum<float>: copies x into a temporary device buffer,
+// runs the routine and returns the single result element read back from the device.
+// Throws std::runtime_error if CLBlast reports a non-success status.
+float cblas_sasum(const int n,
+                  const float* x, const int x_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  // A strided vector of n elements spans (n-1)*inc+1 entries; copying n*inc entries would read
+  // host memory past the end of the caller's array whenever inc > 1.
+  const auto x_size = (n > 0) ? (n - 1) * x_inc + 1 : 0;
+  const auto asum_size = 1;
+  auto x_buffer = clblast::Buffer<float>(context, x_size);
+  auto asum_buffer = clblast::Buffer<float>(context, asum_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+  auto queue_cl = queue();
+  auto s = clblast::Asum<float>(n,
+                                asum_buffer(), 0,
+                                x_buffer(), 0, x_inc,
+                                &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+  float asum[asum_size];
+  asum_buffer.Read(queue, asum_size, reinterpret_cast<float*>(asum));
+  return asum[0];
+}
+// Netlib-style wrapper around clblast::Asum<double>: copies x into a temporary device buffer,
+// runs the routine and returns the single result element read back from the device.
+// Throws std::runtime_error if CLBlast reports a non-success status.
+double cblas_dasum(const int n,
+                   const double* x, const int x_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  // A strided vector of n elements spans (n-1)*inc+1 entries; copying n*inc entries would read
+  // host memory past the end of the caller's array whenever inc > 1.
+  const auto x_size = (n > 0) ? (n - 1) * x_inc + 1 : 0;
+  const auto asum_size = 1;
+  auto x_buffer = clblast::Buffer<double>(context, x_size);
+  auto asum_buffer = clblast::Buffer<double>(context, asum_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+  auto queue_cl = queue();
+  auto s = clblast::Asum<double>(n,
+                                 asum_buffer(), 0,
+                                 x_buffer(), 0, x_inc,
+                                 &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+  double asum[asum_size];
+  asum_buffer.Read(queue, asum_size, reinterpret_cast<double*>(asum));
+  return asum[0];
+}
+// Netlib-style wrapper around clblast::Asum<float2>: copies x into a temporary device buffer,
+// runs the routine and returns the real part of the single result element read back.
+// Throws std::runtime_error if CLBlast reports a non-success status.
+float cblas_scasum(const int n,
+                   const void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  // A strided vector of n elements spans (n-1)*inc+1 entries; copying n*inc entries would read
+  // host memory past the end of the caller's array whenever inc > 1.
+  const auto x_size = (n > 0) ? (n - 1) * x_inc + 1 : 0;
+  const auto asum_size = 1;
+  auto x_buffer = clblast::Buffer<float2>(context, x_size);
+  auto asum_buffer = clblast::Buffer<float2>(context, asum_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+  auto queue_cl = queue();
+  auto s = clblast::Asum<float2>(n,
+                                 asum_buffer(), 0,
+                                 x_buffer(), 0, x_inc,
+                                 &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+  float2 asum[asum_size];
+  asum_buffer.Read(queue, asum_size, reinterpret_cast<float2*>(asum));
+  return asum[0].real();
+}
+// Netlib-style wrapper around clblast::Asum<double2>: copies x into a temporary device buffer,
+// runs the routine and returns the real part of the single result element read back.
+// Throws std::runtime_error if CLBlast reports a non-success status.
+double cblas_dzasum(const int n,
+                    const void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  // A strided vector of n elements spans (n-1)*inc+1 entries; copying n*inc entries would read
+  // host memory past the end of the caller's array whenever inc > 1.
+  const auto x_size = (n > 0) ? (n - 1) * x_inc + 1 : 0;
+  const auto asum_size = 1;
+  auto x_buffer = clblast::Buffer<double2>(context, x_size);
+  auto asum_buffer = clblast::Buffer<double2>(context, asum_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+  auto queue_cl = queue();
+  auto s = clblast::Asum<double2>(n,
+                                  asum_buffer(), 0,
+                                  x_buffer(), 0, x_inc,
+                                  &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+  double2 asum[asum_size];
+  asum_buffer.Read(queue, asum_size, reinterpret_cast<double2*>(asum));
+  return asum[0].real();
+}
+
+// SUM
+// Netlib-style wrapper around clblast::Sum<float>: copies x into a temporary device buffer,
+// runs the routine and returns the single result element read back from the device.
+// Throws std::runtime_error if CLBlast reports a non-success status.
+float cblas_ssum(const int n,
+                 const float* x, const int x_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  // A strided vector of n elements spans (n-1)*inc+1 entries; copying n*inc entries would read
+  // host memory past the end of the caller's array whenever inc > 1.
+  const auto x_size = (n > 0) ? (n - 1) * x_inc + 1 : 0;
+  const auto sum_size = 1;
+  auto x_buffer = clblast::Buffer<float>(context, x_size);
+  auto sum_buffer = clblast::Buffer<float>(context, sum_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+  auto queue_cl = queue();
+  auto s = clblast::Sum<float>(n,
+                               sum_buffer(), 0,
+                               x_buffer(), 0, x_inc,
+                               &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+  float sum[sum_size];
+  sum_buffer.Read(queue, sum_size, reinterpret_cast<float*>(sum));
+  return sum[0];
+}
+// Netlib-style wrapper around clblast::Sum<double>: copies x into a temporary device buffer,
+// runs the routine and returns the single result element read back from the device.
+// Throws std::runtime_error if CLBlast reports a non-success status.
+double cblas_dsum(const int n,
+                  const double* x, const int x_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  // A strided vector of n elements spans (n-1)*inc+1 entries; copying n*inc entries would read
+  // host memory past the end of the caller's array whenever inc > 1.
+  const auto x_size = (n > 0) ? (n - 1) * x_inc + 1 : 0;
+  const auto sum_size = 1;
+  auto x_buffer = clblast::Buffer<double>(context, x_size);
+  auto sum_buffer = clblast::Buffer<double>(context, sum_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+  auto queue_cl = queue();
+  auto s = clblast::Sum<double>(n,
+                                sum_buffer(), 0,
+                                x_buffer(), 0, x_inc,
+                                &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+  double sum[sum_size];
+  sum_buffer.Read(queue, sum_size, reinterpret_cast<double*>(sum));
+  return sum[0];
+}
+// Netlib-style wrapper around clblast::Sum<float2>: copies x into a temporary device buffer,
+// runs the routine and returns the real part of the single result element read back.
+// Throws std::runtime_error if CLBlast reports a non-success status.
+float cblas_scsum(const int n,
+                  const void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  // A strided vector of n elements spans (n-1)*inc+1 entries; copying n*inc entries would read
+  // host memory past the end of the caller's array whenever inc > 1.
+  const auto x_size = (n > 0) ? (n - 1) * x_inc + 1 : 0;
+  const auto sum_size = 1;
+  auto x_buffer = clblast::Buffer<float2>(context, x_size);
+  auto sum_buffer = clblast::Buffer<float2>(context, sum_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+  auto queue_cl = queue();
+  auto s = clblast::Sum<float2>(n,
+                                sum_buffer(), 0,
+                                x_buffer(), 0, x_inc,
+                                &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+  float2 sum[sum_size];
+  sum_buffer.Read(queue, sum_size, reinterpret_cast<float2*>(sum));
+  return sum[0].real();
+}
+// Netlib-style wrapper around clblast::Sum<double2>: copies x into a temporary device buffer,
+// runs the routine and returns the real part of the single result element read back.
+// Throws std::runtime_error if CLBlast reports a non-success status.
+double cblas_dzsum(const int n,
+                   const void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  // A strided vector of n elements spans (n-1)*inc+1 entries; copying n*inc entries would read
+  // host memory past the end of the caller's array whenever inc > 1.
+  const auto x_size = (n > 0) ? (n - 1) * x_inc + 1 : 0;
+  const auto sum_size = 1;
+  auto x_buffer = clblast::Buffer<double2>(context, x_size);
+  auto sum_buffer = clblast::Buffer<double2>(context, sum_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+  auto queue_cl = queue();
+  auto s = clblast::Sum<double2>(n,
+                                 sum_buffer(), 0,
+                                 x_buffer(), 0, x_inc,
+                                 &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+  double2 sum[sum_size];
+  sum_buffer.Read(queue, sum_size, reinterpret_cast<double2*>(sum));
+  return sum[0].real();
+}
+
+// AMAX
+// Netlib-style wrapper around clblast::Amax<float>: copies x into a temporary device buffer,
+// runs the routine and returns the single integer written to the result buffer.
+// Throws std::runtime_error if CLBlast reports a non-success status.
+int cblas_isamax(const int n,
+                 const float* x, const int x_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  // A strided vector of n elements spans (n-1)*inc+1 entries; copying n*inc entries would read
+  // host memory past the end of the caller's array whenever inc > 1.
+  const auto x_size = (n > 0) ? (n - 1) * x_inc + 1 : 0;
+  const auto imax_size = 1;
+  auto x_buffer = clblast::Buffer<float>(context, x_size);
+  auto imax_buffer = clblast::Buffer<int>(context, imax_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+  auto queue_cl = queue();
+  auto s = clblast::Amax<float>(n,
+                                imax_buffer(), 0,
+                                x_buffer(), 0, x_inc,
+                                &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+  int imax[imax_size];
+  imax_buffer.Read(queue, imax_size, reinterpret_cast<int*>(imax));
+  return imax[0];
+}
+// Netlib-style wrapper around clblast::Amax<double>: copies x into a temporary device buffer,
+// runs the routine and returns the single integer written to the result buffer.
+// Throws std::runtime_error if CLBlast reports a non-success status.
+int cblas_idamax(const int n,
+                 const double* x, const int x_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  // A strided vector of n elements spans (n-1)*inc+1 entries; copying n*inc entries would read
+  // host memory past the end of the caller's array whenever inc > 1.
+  const auto x_size = (n > 0) ? (n - 1) * x_inc + 1 : 0;
+  const auto imax_size = 1;
+  auto x_buffer = clblast::Buffer<double>(context, x_size);
+  auto imax_buffer = clblast::Buffer<int>(context, imax_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+  auto queue_cl = queue();
+  auto s = clblast::Amax<double>(n,
+                                 imax_buffer(), 0,
+                                 x_buffer(), 0, x_inc,
+                                 &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+  int imax[imax_size];
+  imax_buffer.Read(queue, imax_size, reinterpret_cast<int*>(imax));
+  return imax[0];
+}
+// Netlib-style wrapper around clblast::Amax<float2>: copies x into a temporary device buffer,
+// runs the routine and returns the single integer written to the result buffer.
+// Throws std::runtime_error if CLBlast reports a non-success status.
+int cblas_icamax(const int n,
+                 const void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  // A strided vector of n elements spans (n-1)*inc+1 entries; copying n*inc entries would read
+  // host memory past the end of the caller's array whenever inc > 1.
+  const auto x_size = (n > 0) ? (n - 1) * x_inc + 1 : 0;
+  const auto imax_size = 1;
+  auto x_buffer = clblast::Buffer<float2>(context, x_size);
+  auto imax_buffer = clblast::Buffer<int>(context, imax_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+  auto queue_cl = queue();
+  auto s = clblast::Amax<float2>(n,
+                                 imax_buffer(), 0,
+                                 x_buffer(), 0, x_inc,
+                                 &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+  int imax[imax_size];
+  imax_buffer.Read(queue, imax_size, reinterpret_cast<int*>(imax));
+  return imax[0];
+}
+// Netlib-style wrapper around clblast::Amax<double2>: copies x into a temporary device buffer,
+// runs the routine and returns the single integer written to the result buffer.
+// Throws std::runtime_error if CLBlast reports a non-success status.
+int cblas_izamax(const int n,
+                 const void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  // A strided vector of n elements spans (n-1)*inc+1 entries; copying n*inc entries would read
+  // host memory past the end of the caller's array whenever inc > 1.
+  const auto x_size = (n > 0) ? (n - 1) * x_inc + 1 : 0;
+  const auto imax_size = 1;
+  auto x_buffer = clblast::Buffer<double2>(context, x_size);
+  auto imax_buffer = clblast::Buffer<int>(context, imax_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+  auto queue_cl = queue();
+  auto s = clblast::Amax<double2>(n,
+                                  imax_buffer(), 0,
+                                  x_buffer(), 0, x_inc,
+                                  &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+  int imax[imax_size];
+  imax_buffer.Read(queue, imax_size, reinterpret_cast<int*>(imax));
+  return imax[0];
+}
+
+// MAX
+// Netlib-style wrapper around clblast::Max<float>: copies x into a temporary device buffer,
+// runs the routine and returns the single integer written to the result buffer.
+// Throws std::runtime_error if CLBlast reports a non-success status.
+int cblas_ismax(const int n,
+                const float* x, const int x_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  // A strided vector of n elements spans (n-1)*inc+1 entries; copying n*inc entries would read
+  // host memory past the end of the caller's array whenever inc > 1.
+  const auto x_size = (n > 0) ? (n - 1) * x_inc + 1 : 0;
+  const auto imax_size = 1;
+  auto x_buffer = clblast::Buffer<float>(context, x_size);
+  auto imax_buffer = clblast::Buffer<int>(context, imax_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+  auto queue_cl = queue();
+  auto s = clblast::Max<float>(n,
+                               imax_buffer(), 0,
+                               x_buffer(), 0, x_inc,
+                               &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+  int imax[imax_size];
+  imax_buffer.Read(queue, imax_size, reinterpret_cast<int*>(imax));
+  return imax[0];
+}
+// Netlib-style wrapper around clblast::Max<double>: copies x into a temporary device buffer,
+// runs the routine and returns the single integer written to the result buffer.
+// Throws std::runtime_error if CLBlast reports a non-success status.
+int cblas_idmax(const int n,
+                const double* x, const int x_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  // A strided vector of n elements spans (n-1)*inc+1 entries; copying n*inc entries would read
+  // host memory past the end of the caller's array whenever inc > 1.
+  const auto x_size = (n > 0) ? (n - 1) * x_inc + 1 : 0;
+  const auto imax_size = 1;
+  auto x_buffer = clblast::Buffer<double>(context, x_size);
+  auto imax_buffer = clblast::Buffer<int>(context, imax_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+  auto queue_cl = queue();
+  auto s = clblast::Max<double>(n,
+                                imax_buffer(), 0,
+                                x_buffer(), 0, x_inc,
+                                &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+  int imax[imax_size];
+  imax_buffer.Read(queue, imax_size, reinterpret_cast<int*>(imax));
+  return imax[0];
+}
+// Netlib-style wrapper around clblast::Max<float2>: copies x into a temporary device buffer,
+// runs the routine and returns the single integer written to the result buffer.
+// Throws std::runtime_error if CLBlast reports a non-success status.
+int cblas_icmax(const int n,
+                const void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  // A strided vector of n elements spans (n-1)*inc+1 entries; copying n*inc entries would read
+  // host memory past the end of the caller's array whenever inc > 1.
+  const auto x_size = (n > 0) ? (n - 1) * x_inc + 1 : 0;
+  const auto imax_size = 1;
+  auto x_buffer = clblast::Buffer<float2>(context, x_size);
+  auto imax_buffer = clblast::Buffer<int>(context, imax_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+  auto queue_cl = queue();
+  auto s = clblast::Max<float2>(n,
+                                imax_buffer(), 0,
+                                x_buffer(), 0, x_inc,
+                                &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+  int imax[imax_size];
+  imax_buffer.Read(queue, imax_size, reinterpret_cast<int*>(imax));
+  return imax[0];
+}
+// Netlib-style wrapper around clblast::Max<double2>: copies x into a temporary device buffer,
+// runs the routine and returns the single integer written to the result buffer.
+// Throws std::runtime_error if CLBlast reports a non-success status.
+int cblas_izmax(const int n,
+                const void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  // A strided vector of n elements spans (n-1)*inc+1 entries; copying n*inc entries would read
+  // host memory past the end of the caller's array whenever inc > 1.
+  const auto x_size = (n > 0) ? (n - 1) * x_inc + 1 : 0;
+  const auto imax_size = 1;
+  auto x_buffer = clblast::Buffer<double2>(context, x_size);
+  auto imax_buffer = clblast::Buffer<int>(context, imax_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+  auto queue_cl = queue();
+  auto s = clblast::Max<double2>(n,
+                                 imax_buffer(), 0,
+                                 x_buffer(), 0, x_inc,
+                                 &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+  int imax[imax_size];
+  imax_buffer.Read(queue, imax_size, reinterpret_cast<int*>(imax));
+  return imax[0];
+}
+
+// MIN
+// Netlib-style wrapper around clblast::Min<float>: copies x into a temporary device buffer,
+// runs the routine and returns the single integer written to the result buffer.
+// Throws std::runtime_error if CLBlast reports a non-success status.
+int cblas_ismin(const int n,
+                const float* x, const int x_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  // A strided vector of n elements spans (n-1)*inc+1 entries; copying n*inc entries would read
+  // host memory past the end of the caller's array whenever inc > 1.
+  const auto x_size = (n > 0) ? (n - 1) * x_inc + 1 : 0;
+  const auto imin_size = 1;
+  auto x_buffer = clblast::Buffer<float>(context, x_size);
+  auto imin_buffer = clblast::Buffer<int>(context, imin_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+  auto queue_cl = queue();
+  auto s = clblast::Min<float>(n,
+                               imin_buffer(), 0,
+                               x_buffer(), 0, x_inc,
+                               &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+  int imin[imin_size];
+  imin_buffer.Read(queue, imin_size, reinterpret_cast<int*>(imin));
+  return imin[0];
+}
+// Netlib-style wrapper around clblast::Min<double>: copies x into a temporary device buffer,
+// runs the routine and returns the single integer written to the result buffer.
+// Throws std::runtime_error if CLBlast reports a non-success status.
+int cblas_idmin(const int n,
+                const double* x, const int x_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  // A strided vector of n elements spans (n-1)*inc+1 entries; copying n*inc entries would read
+  // host memory past the end of the caller's array whenever inc > 1.
+  const auto x_size = (n > 0) ? (n - 1) * x_inc + 1 : 0;
+  const auto imin_size = 1;
+  auto x_buffer = clblast::Buffer<double>(context, x_size);
+  auto imin_buffer = clblast::Buffer<int>(context, imin_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+  auto queue_cl = queue();
+  auto s = clblast::Min<double>(n,
+                                imin_buffer(), 0,
+                                x_buffer(), 0, x_inc,
+                                &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+  int imin[imin_size];
+  imin_buffer.Read(queue, imin_size, reinterpret_cast<int*>(imin));
+  return imin[0];
+}
+// Netlib-style wrapper around clblast::Min<float2>: copies x into a temporary device buffer,
+// runs the routine and returns the single integer written to the result buffer.
+// Throws std::runtime_error if CLBlast reports a non-success status.
+int cblas_icmin(const int n,
+                const void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  // A strided vector of n elements spans (n-1)*inc+1 entries; copying n*inc entries would read
+  // host memory past the end of the caller's array whenever inc > 1.
+  const auto x_size = (n > 0) ? (n - 1) * x_inc + 1 : 0;
+  const auto imin_size = 1;
+  auto x_buffer = clblast::Buffer<float2>(context, x_size);
+  auto imin_buffer = clblast::Buffer<int>(context, imin_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+  auto queue_cl = queue();
+  auto s = clblast::Min<float2>(n,
+                                imin_buffer(), 0,
+                                x_buffer(), 0, x_inc,
+                                &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+  int imin[imin_size];
+  imin_buffer.Read(queue, imin_size, reinterpret_cast<int*>(imin));
+  return imin[0];
+}
+// Netlib-style wrapper around clblast::Min<double2>: copies x into a temporary device buffer,
+// runs the routine and returns the single integer written to the result buffer.
+// Throws std::runtime_error if CLBlast reports a non-success status.
+int cblas_izmin(const int n,
+                const void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  // A strided vector of n elements spans (n-1)*inc+1 entries; copying n*inc entries would read
+  // host memory past the end of the caller's array whenever inc > 1.
+  const auto x_size = (n > 0) ? (n - 1) * x_inc + 1 : 0;
+  const auto imin_size = 1;
+  auto x_buffer = clblast::Buffer<double2>(context, x_size);
+  auto imin_buffer = clblast::Buffer<int>(context, imin_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+  auto queue_cl = queue();
+  auto s = clblast::Min<double2>(n,
+                                 imin_buffer(), 0,
+                                 x_buffer(), 0, x_inc,
+                                 &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+  int imin[imin_size];
+  imin_buffer.Read(queue, imin_size, reinterpret_cast<int*>(imin));
+  return imin[0];
+}
+
+// =================================================================================================
+// BLAS level-2 (matrix-vector) routines
+// =================================================================================================
+
+// GEMV
+// Netlib-style wrapper around clblast::Gemv for float data: copies a, x and y into temporary
+// device buffers, runs the routine and copies the y buffer back into the caller's y.
+// Throws std::runtime_error if CLBlast reports a non-success status.
+void cblas_sgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+                 const int m, const int n,
+                 const float alpha,
+                 const float* a, const int a_ld,
+                 const float* x, const int x_inc,
+                 const float beta,
+                 float* y, const int y_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  const auto alpha_cpp = alpha;
+  const auto beta_cpp = beta;
+  const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
+  // x holds m elements and y holds n when A is transposed, and the other way round otherwise.
+  const auto x_len = (a_transpose != CLBlastTransposeNo) ? m : n;
+  const auto y_len = (a_transpose != CLBlastTransposeNo) ? n : m;
+  // A strided vector of len elements spans (len-1)*inc+1 entries; copying len*inc entries would
+  // read (x, y) and write (y) host memory past the end of the caller's arrays when inc > 1.
+  const auto x_size = (x_len > 0) ? (x_len - 1) * x_inc + 1 : 0;
+  const auto y_size = (y_len > 0) ? (y_len - 1) * y_inc + 1 : 0;
+  auto a_buffer = clblast::Buffer<float>(context, a_size);
+  auto x_buffer = clblast::Buffer<float>(context, x_size);
+  auto y_buffer = clblast::Buffer<float>(context, y_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast<float*>(y));
+  auto queue_cl = queue();
+  auto s = clblast::Gemv(static_cast<clblast::Layout>(layout),
+                         static_cast<clblast::Transpose>(a_transpose),
+                         m, n,
+                         alpha_cpp,
+                         a_buffer(), 0, a_ld,
+                         x_buffer(), 0, x_inc,
+                         beta_cpp,
+                         y_buffer(), 0, y_inc,
+                         &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast<float*>(y));
+}
+// Netlib-style wrapper around clblast::Gemv for double data: copies a, x and y into temporary
+// device buffers, runs the routine and copies the y buffer back into the caller's y.
+// Throws std::runtime_error if CLBlast reports a non-success status.
+void cblas_dgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+                 const int m, const int n,
+                 const double alpha,
+                 const double* a, const int a_ld,
+                 const double* x, const int x_inc,
+                 const double beta,
+                 double* y, const int y_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  const auto alpha_cpp = alpha;
+  const auto beta_cpp = beta;
+  const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
+  // x holds m elements and y holds n when A is transposed, and the other way round otherwise.
+  const auto x_len = (a_transpose != CLBlastTransposeNo) ? m : n;
+  const auto y_len = (a_transpose != CLBlastTransposeNo) ? n : m;
+  // A strided vector of len elements spans (len-1)*inc+1 entries; copying len*inc entries would
+  // read (x, y) and write (y) host memory past the end of the caller's arrays when inc > 1.
+  const auto x_size = (x_len > 0) ? (x_len - 1) * x_inc + 1 : 0;
+  const auto y_size = (y_len > 0) ? (y_len - 1) * y_inc + 1 : 0;
+  auto a_buffer = clblast::Buffer<double>(context, a_size);
+  auto x_buffer = clblast::Buffer<double>(context, x_size);
+  auto y_buffer = clblast::Buffer<double>(context, y_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast<double*>(y));
+  auto queue_cl = queue();
+  auto s = clblast::Gemv(static_cast<clblast::Layout>(layout),
+                         static_cast<clblast::Transpose>(a_transpose),
+                         m, n,
+                         alpha_cpp,
+                         a_buffer(), 0, a_ld,
+                         x_buffer(), 0, x_inc,
+                         beta_cpp,
+                         y_buffer(), 0, y_inc,
+                         &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast<double*>(y));
+}
+// Netlib-style wrapper around clblast::Gemv for float2 data: copies a, x and y into temporary
+// device buffers, runs the routine and copies the y buffer back into the caller's y.
+// alpha and beta each point at two consecutive floats that are packed into a float2.
+// Throws std::runtime_error if CLBlast reports a non-success status.
+void cblas_cgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+                 const int m, const int n,
+                 const void* alpha,
+                 const void* a, const int a_ld,
+                 const void* x, const int x_inc,
+                 const void* beta,
+                 void* y, const int y_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+  const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
+  const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
+  // x holds m elements and y holds n when A is transposed, and the other way round otherwise.
+  const auto x_len = (a_transpose != CLBlastTransposeNo) ? m : n;
+  const auto y_len = (a_transpose != CLBlastTransposeNo) ? n : m;
+  // A strided vector of len elements spans (len-1)*inc+1 entries; copying len*inc entries would
+  // read (x, y) and write (y) host memory past the end of the caller's arrays when inc > 1.
+  const auto x_size = (x_len > 0) ? (x_len - 1) * x_inc + 1 : 0;
+  const auto y_size = (y_len > 0) ? (y_len - 1) * y_inc + 1 : 0;
+  auto a_buffer = clblast::Buffer<float2>(context, a_size);
+  auto x_buffer = clblast::Buffer<float2>(context, x_size);
+  auto y_buffer = clblast::Buffer<float2>(context, y_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast<float2*>(y));
+  auto queue_cl = queue();
+  auto s = clblast::Gemv(static_cast<clblast::Layout>(layout),
+                         static_cast<clblast::Transpose>(a_transpose),
+                         m, n,
+                         alpha_cpp,
+                         a_buffer(), 0, a_ld,
+                         x_buffer(), 0, x_inc,
+                         beta_cpp,
+                         y_buffer(), 0, y_inc,
+                         &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast<float2*>(y));
+}
+// Netlib-style wrapper around clblast::Gemv for double2 data: copies a, x and y into temporary
+// device buffers, runs the routine and copies the y buffer back into the caller's y.
+// alpha and beta each point at two consecutive doubles that are packed into a double2.
+// Throws std::runtime_error if CLBlast reports a non-success status.
+void cblas_zgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+                 const int m, const int n,
+                 const void* alpha,
+                 const void* a, const int a_ld,
+                 const void* x, const int x_inc,
+                 const void* beta,
+                 void* y, const int y_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+  const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
+  const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
+  // x holds m elements and y holds n when A is transposed, and the other way round otherwise.
+  const auto x_len = (a_transpose != CLBlastTransposeNo) ? m : n;
+  const auto y_len = (a_transpose != CLBlastTransposeNo) ? n : m;
+  // A strided vector of len elements spans (len-1)*inc+1 entries; copying len*inc entries would
+  // read (x, y) and write (y) host memory past the end of the caller's arrays when inc > 1.
+  const auto x_size = (x_len > 0) ? (x_len - 1) * x_inc + 1 : 0;
+  const auto y_size = (y_len > 0) ? (y_len - 1) * y_inc + 1 : 0;
+  auto a_buffer = clblast::Buffer<double2>(context, a_size);
+  auto x_buffer = clblast::Buffer<double2>(context, x_size);
+  auto y_buffer = clblast::Buffer<double2>(context, y_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast<double2*>(y));
+  auto queue_cl = queue();
+  auto s = clblast::Gemv(static_cast<clblast::Layout>(layout),
+                         static_cast<clblast::Transpose>(a_transpose),
+                         m, n,
+                         alpha_cpp,
+                         a_buffer(), 0, a_ld,
+                         x_buffer(), 0, x_inc,
+                         beta_cpp,
+                         y_buffer(), 0, y_inc,
+                         &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast<double2*>(y));
+}
+
+// GBMV
+// Netlib-style wrapper around clblast::Gbmv for float data: copies a, x and y into temporary
+// device buffers, runs the routine and copies the y buffer back into the caller's y.
+// Throws std::runtime_error if CLBlast reports a non-success status.
+void cblas_sgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+                 const int m, const int n, const int kl, const int ku,
+                 const float alpha,
+                 const float* a, const int a_ld,
+                 const float* x, const int x_inc,
+                 const float beta,
+                 float* y, const int y_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  const auto alpha_cpp = alpha;
+  const auto beta_cpp = beta;
+  const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
+  // x holds m elements and y holds n when A is transposed, and the other way round otherwise.
+  const auto x_len = (a_transpose != CLBlastTransposeNo) ? m : n;
+  const auto y_len = (a_transpose != CLBlastTransposeNo) ? n : m;
+  // A strided vector of len elements spans (len-1)*inc+1 entries; copying len*inc entries would
+  // read (x, y) and write (y) host memory past the end of the caller's arrays when inc > 1.
+  const auto x_size = (x_len > 0) ? (x_len - 1) * x_inc + 1 : 0;
+  const auto y_size = (y_len > 0) ? (y_len - 1) * y_inc + 1 : 0;
+  auto a_buffer = clblast::Buffer<float>(context, a_size);
+  auto x_buffer = clblast::Buffer<float>(context, x_size);
+  auto y_buffer = clblast::Buffer<float>(context, y_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast<float*>(y));
+  auto queue_cl = queue();
+  auto s = clblast::Gbmv(static_cast<clblast::Layout>(layout),
+                         static_cast<clblast::Transpose>(a_transpose),
+                         m, n, kl, ku,
+                         alpha_cpp,
+                         a_buffer(), 0, a_ld,
+                         x_buffer(), 0, x_inc,
+                         beta_cpp,
+                         y_buffer(), 0, y_inc,
+                         &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast<float*>(y));
+}
+// Netlib-style wrapper around clblast::Gbmv for double data: copies a, x and y into temporary
+// device buffers, runs the routine and copies the y buffer back into the caller's y.
+// Throws std::runtime_error if CLBlast reports a non-success status.
+void cblas_dgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+                 const int m, const int n, const int kl, const int ku,
+                 const double alpha,
+                 const double* a, const int a_ld,
+                 const double* x, const int x_inc,
+                 const double beta,
+                 double* y, const int y_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  const auto alpha_cpp = alpha;
+  const auto beta_cpp = beta;
+  const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
+  // x holds m elements and y holds n when A is transposed, and the other way round otherwise.
+  const auto x_len = (a_transpose != CLBlastTransposeNo) ? m : n;
+  const auto y_len = (a_transpose != CLBlastTransposeNo) ? n : m;
+  // A strided vector of len elements spans (len-1)*inc+1 entries; copying len*inc entries would
+  // read (x, y) and write (y) host memory past the end of the caller's arrays when inc > 1.
+  const auto x_size = (x_len > 0) ? (x_len - 1) * x_inc + 1 : 0;
+  const auto y_size = (y_len > 0) ? (y_len - 1) * y_inc + 1 : 0;
+  auto a_buffer = clblast::Buffer<double>(context, a_size);
+  auto x_buffer = clblast::Buffer<double>(context, x_size);
+  auto y_buffer = clblast::Buffer<double>(context, y_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast<double*>(y));
+  auto queue_cl = queue();
+  auto s = clblast::Gbmv(static_cast<clblast::Layout>(layout),
+                         static_cast<clblast::Transpose>(a_transpose),
+                         m, n, kl, ku,
+                         alpha_cpp,
+                         a_buffer(), 0, a_ld,
+                         x_buffer(), 0, x_inc,
+                         beta_cpp,
+                         y_buffer(), 0, y_inc,
+                         &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast<double*>(y));
+}
+// Netlib-style wrapper around clblast::Gbmv for float2 data: copies a, x and y into temporary
+// device buffers, runs the routine and copies the y buffer back into the caller's y.
+// alpha and beta each point at two consecutive floats that are packed into a float2.
+// Throws std::runtime_error if CLBlast reports a non-success status.
+void cblas_cgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+                 const int m, const int n, const int kl, const int ku,
+                 const void* alpha,
+                 const void* a, const int a_ld,
+                 const void* x, const int x_inc,
+                 const void* beta,
+                 void* y, const int y_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+  const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
+  const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
+  // x holds m elements and y holds n when A is transposed, and the other way round otherwise.
+  const auto x_len = (a_transpose != CLBlastTransposeNo) ? m : n;
+  const auto y_len = (a_transpose != CLBlastTransposeNo) ? n : m;
+  // A strided vector of len elements spans (len-1)*inc+1 entries; copying len*inc entries would
+  // read (x, y) and write (y) host memory past the end of the caller's arrays when inc > 1.
+  const auto x_size = (x_len > 0) ? (x_len - 1) * x_inc + 1 : 0;
+  const auto y_size = (y_len > 0) ? (y_len - 1) * y_inc + 1 : 0;
+  auto a_buffer = clblast::Buffer<float2>(context, a_size);
+  auto x_buffer = clblast::Buffer<float2>(context, x_size);
+  auto y_buffer = clblast::Buffer<float2>(context, y_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast<float2*>(y));
+  auto queue_cl = queue();
+  auto s = clblast::Gbmv(static_cast<clblast::Layout>(layout),
+                         static_cast<clblast::Transpose>(a_transpose),
+                         m, n, kl, ku,
+                         alpha_cpp,
+                         a_buffer(), 0, a_ld,
+                         x_buffer(), 0, x_inc,
+                         beta_cpp,
+                         y_buffer(), 0, y_inc,
+                         &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast<float2*>(y));
+}
+// Netlib-style wrapper around clblast::Gbmv for double2 data: copies a, x and y into temporary
+// device buffers, runs the routine and copies the y buffer back into the caller's y.
+// alpha and beta each point at two consecutive doubles that are packed into a double2.
+// Throws std::runtime_error if CLBlast reports a non-success status.
+void cblas_zgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+                 const int m, const int n, const int kl, const int ku,
+                 const void* alpha,
+                 const void* a, const int a_ld,
+                 const void* x, const int x_inc,
+                 const void* beta,
+                 void* y, const int y_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+  const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
+  const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
+  // x holds m elements and y holds n when A is transposed, and the other way round otherwise.
+  const auto x_len = (a_transpose != CLBlastTransposeNo) ? m : n;
+  const auto y_len = (a_transpose != CLBlastTransposeNo) ? n : m;
+  // A strided vector of len elements spans (len-1)*inc+1 entries; copying len*inc entries would
+  // read (x, y) and write (y) host memory past the end of the caller's arrays when inc > 1.
+  const auto x_size = (x_len > 0) ? (x_len - 1) * x_inc + 1 : 0;
+  const auto y_size = (y_len > 0) ? (y_len - 1) * y_inc + 1 : 0;
+  auto a_buffer = clblast::Buffer<double2>(context, a_size);
+  auto x_buffer = clblast::Buffer<double2>(context, x_size);
+  auto y_buffer = clblast::Buffer<double2>(context, y_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast<double2*>(y));
+  auto queue_cl = queue();
+  auto s = clblast::Gbmv(static_cast<clblast::Layout>(layout),
+                         static_cast<clblast::Transpose>(a_transpose),
+                         m, n, kl, ku,
+                         alpha_cpp,
+                         a_buffer(), 0, a_ld,
+                         x_buffer(), 0, x_inc,
+                         beta_cpp,
+                         y_buffer(), 0, y_inc,
+                         &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast<double2*>(y));
+}
+
+// HEMV
+// Netlib CBLAS entry point for CHEMV. Copies a, x and y into temporary device buffers, forwards
+// the call to clblast::Hemv and copies y_buffer back into y afterwards.
+// alpha/beta: each is read as two consecutive floats that initialise a float2.
+// Throws std::runtime_error if CLBlast reports a status other than kSuccess.
+void cblas_chemv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                 const int n,
+                 const void* alpha,
+                 const void* a, const int a_ld,
+                 const void* x, const int x_inc,
+                 const void* beta,
+                 void* y, const int y_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+  const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
+  // Element counts are computed in size_t: plain int products could overflow (undefined
+  // behaviour) for large problem sizes.
+  const auto a_size = static_cast<size_t>(n) * static_cast<size_t>(a_ld);
+  const auto x_size = static_cast<size_t>(n) * static_cast<size_t>(x_inc);
+  const auto y_size = static_cast<size_t>(n) * static_cast<size_t>(y_inc);
+  auto a_buffer = clblast::Buffer<float2>(context, a_size);
+  auto x_buffer = clblast::Buffer<float2>(context, x_size);
+  auto y_buffer = clblast::Buffer<float2>(context, y_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast<float2*>(y));
+  auto queue_cl = queue();
+  const auto status = clblast::Hemv(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      n,
+      alpha_cpp,
+      a_buffer(), 0, a_ld,
+      x_buffer(), 0, x_inc,
+      beta_cpp,
+      y_buffer(), 0, y_inc,
+      &queue_cl);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  // Copy y_buffer back into the caller's y array
+  y_buffer.Read(queue, y_size, reinterpret_cast<float2*>(y));
+}
+// Netlib CBLAS entry point for ZHEMV. Copies a, x and y into temporary device buffers, forwards
+// the call to clblast::Hemv and copies y_buffer back into y afterwards.
+// alpha/beta: each is read as two consecutive doubles that initialise a double2.
+// Throws std::runtime_error if CLBlast reports a status other than kSuccess.
+void cblas_zhemv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                 const int n,
+                 const void* alpha,
+                 const void* a, const int a_ld,
+                 const void* x, const int x_inc,
+                 const void* beta,
+                 void* y, const int y_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+  const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
+  // Element counts are computed in size_t: plain int products could overflow (undefined
+  // behaviour) for large problem sizes.
+  const auto a_size = static_cast<size_t>(n) * static_cast<size_t>(a_ld);
+  const auto x_size = static_cast<size_t>(n) * static_cast<size_t>(x_inc);
+  const auto y_size = static_cast<size_t>(n) * static_cast<size_t>(y_inc);
+  auto a_buffer = clblast::Buffer<double2>(context, a_size);
+  auto x_buffer = clblast::Buffer<double2>(context, x_size);
+  auto y_buffer = clblast::Buffer<double2>(context, y_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast<double2*>(y));
+  auto queue_cl = queue();
+  const auto status = clblast::Hemv(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      n,
+      alpha_cpp,
+      a_buffer(), 0, a_ld,
+      x_buffer(), 0, x_inc,
+      beta_cpp,
+      y_buffer(), 0, y_inc,
+      &queue_cl);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  // Copy y_buffer back into the caller's y array
+  y_buffer.Read(queue, y_size, reinterpret_cast<double2*>(y));
+}
+
+// HBMV
+// Netlib CBLAS entry point for CHBMV. Copies a, x and y into temporary device buffers, forwards
+// the call to clblast::Hbmv and copies y_buffer back into y afterwards.
+// alpha/beta: each is read as two consecutive floats that initialise a float2.
+// Throws std::runtime_error if CLBlast reports a status other than kSuccess.
+void cblas_chbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                 const int n, const int k,
+                 const void* alpha,
+                 const void* a, const int a_ld,
+                 const void* x, const int x_inc,
+                 const void* beta,
+                 void* y, const int y_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+  const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
+  // Element counts are computed in size_t: plain int products could overflow (undefined
+  // behaviour) for large problem sizes.
+  const auto a_size = static_cast<size_t>(n) * static_cast<size_t>(a_ld);
+  const auto x_size = static_cast<size_t>(n) * static_cast<size_t>(x_inc);
+  const auto y_size = static_cast<size_t>(n) * static_cast<size_t>(y_inc);
+  auto a_buffer = clblast::Buffer<float2>(context, a_size);
+  auto x_buffer = clblast::Buffer<float2>(context, x_size);
+  auto y_buffer = clblast::Buffer<float2>(context, y_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast<float2*>(y));
+  auto queue_cl = queue();
+  const auto status = clblast::Hbmv(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      n, k,
+      alpha_cpp,
+      a_buffer(), 0, a_ld,
+      x_buffer(), 0, x_inc,
+      beta_cpp,
+      y_buffer(), 0, y_inc,
+      &queue_cl);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  // Copy y_buffer back into the caller's y array
+  y_buffer.Read(queue, y_size, reinterpret_cast<float2*>(y));
+}
+// Netlib CBLAS entry point for ZHBMV. Copies a, x and y into temporary device buffers, forwards
+// the call to clblast::Hbmv and copies y_buffer back into y afterwards.
+// alpha/beta: each is read as two consecutive doubles that initialise a double2.
+// Throws std::runtime_error if CLBlast reports a status other than kSuccess.
+void cblas_zhbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                 const int n, const int k,
+                 const void* alpha,
+                 const void* a, const int a_ld,
+                 const void* x, const int x_inc,
+                 const void* beta,
+                 void* y, const int y_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+  const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
+  // Element counts are computed in size_t: plain int products could overflow (undefined
+  // behaviour) for large problem sizes.
+  const auto a_size = static_cast<size_t>(n) * static_cast<size_t>(a_ld);
+  const auto x_size = static_cast<size_t>(n) * static_cast<size_t>(x_inc);
+  const auto y_size = static_cast<size_t>(n) * static_cast<size_t>(y_inc);
+  auto a_buffer = clblast::Buffer<double2>(context, a_size);
+  auto x_buffer = clblast::Buffer<double2>(context, x_size);
+  auto y_buffer = clblast::Buffer<double2>(context, y_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast<double2*>(y));
+  auto queue_cl = queue();
+  const auto status = clblast::Hbmv(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      n, k,
+      alpha_cpp,
+      a_buffer(), 0, a_ld,
+      x_buffer(), 0, x_inc,
+      beta_cpp,
+      y_buffer(), 0, y_inc,
+      &queue_cl);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  // Copy y_buffer back into the caller's y array
+  y_buffer.Read(queue, y_size, reinterpret_cast<double2*>(y));
+}
+
+// HPMV
+// Netlib CBLAS entry point for CHPMV. Copies ap, x and y into temporary device buffers, forwards
+// the call to clblast::Hpmv and copies y_buffer back into y afterwards.
+// alpha/beta: each is read as two consecutive floats that initialise a float2.
+// ap: n*(n+1)/2 elements are read from it.
+// Throws std::runtime_error if CLBlast reports a status other than kSuccess.
+void cblas_chpmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                 const int n,
+                 const void* alpha,
+                 const void* ap,
+                 const void* x, const int x_inc,
+                 const void* beta,
+                 void* y, const int y_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+  const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
+  // Element counts are computed in size_t: plain int products could overflow (undefined
+  // behaviour) for large problem sizes.
+  const auto n_elems = static_cast<size_t>(n);
+  const auto ap_size = (n_elems * (n_elems + 1)) / 2;
+  const auto x_size = n_elems * static_cast<size_t>(x_inc);
+  const auto y_size = n_elems * static_cast<size_t>(y_inc);
+  auto ap_buffer = clblast::Buffer<float2>(context, ap_size);
+  auto x_buffer = clblast::Buffer<float2>(context, x_size);
+  auto y_buffer = clblast::Buffer<float2>(context, y_size);
+  ap_buffer.Write(queue, ap_size, reinterpret_cast<const float2*>(ap));
+  x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast<float2*>(y));
+  auto queue_cl = queue();
+  const auto status = clblast::Hpmv(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      n,
+      alpha_cpp,
+      ap_buffer(), 0,
+      x_buffer(), 0, x_inc,
+      beta_cpp,
+      y_buffer(), 0, y_inc,
+      &queue_cl);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  // Copy y_buffer back into the caller's y array
+  y_buffer.Read(queue, y_size, reinterpret_cast<float2*>(y));
+}
+// Netlib CBLAS entry point for ZHPMV. Copies ap, x and y into temporary device buffers, forwards
+// the call to clblast::Hpmv and copies y_buffer back into y afterwards.
+// alpha/beta: each is read as two consecutive doubles that initialise a double2.
+// ap: n*(n+1)/2 elements are read from it.
+// Throws std::runtime_error if CLBlast reports a status other than kSuccess.
+void cblas_zhpmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                 const int n,
+                 const void* alpha,
+                 const void* ap,
+                 const void* x, const int x_inc,
+                 const void* beta,
+                 void* y, const int y_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+  const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
+  // Element counts are computed in size_t: plain int products could overflow (undefined
+  // behaviour) for large problem sizes.
+  const auto n_elems = static_cast<size_t>(n);
+  const auto ap_size = (n_elems * (n_elems + 1)) / 2;
+  const auto x_size = n_elems * static_cast<size_t>(x_inc);
+  const auto y_size = n_elems * static_cast<size_t>(y_inc);
+  auto ap_buffer = clblast::Buffer<double2>(context, ap_size);
+  auto x_buffer = clblast::Buffer<double2>(context, x_size);
+  auto y_buffer = clblast::Buffer<double2>(context, y_size);
+  ap_buffer.Write(queue, ap_size, reinterpret_cast<const double2*>(ap));
+  x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast<double2*>(y));
+  auto queue_cl = queue();
+  const auto status = clblast::Hpmv(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      n,
+      alpha_cpp,
+      ap_buffer(), 0,
+      x_buffer(), 0, x_inc,
+      beta_cpp,
+      y_buffer(), 0, y_inc,
+      &queue_cl);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  // Copy y_buffer back into the caller's y array
+  y_buffer.Read(queue, y_size, reinterpret_cast<double2*>(y));
+}
+
+// SYMV
+// Netlib CBLAS entry point for SSYMV. Copies a, x and y into temporary device buffers, forwards
+// the call to clblast::Symv and copies y_buffer back into y afterwards.
+// Throws std::runtime_error if CLBlast reports a status other than kSuccess.
+void cblas_ssymv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                 const int n,
+                 const float alpha,
+                 const float* a, const int a_ld,
+                 const float* x, const int x_inc,
+                 const float beta,
+                 float* y, const int y_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  // Element counts are computed in size_t: plain int products could overflow (undefined
+  // behaviour) for large problem sizes.
+  const auto a_size = static_cast<size_t>(n) * static_cast<size_t>(a_ld);
+  const auto x_size = static_cast<size_t>(n) * static_cast<size_t>(x_inc);
+  const auto y_size = static_cast<size_t>(n) * static_cast<size_t>(y_inc);
+  auto a_buffer = clblast::Buffer<float>(context, a_size);
+  auto x_buffer = clblast::Buffer<float>(context, x_size);
+  auto y_buffer = clblast::Buffer<float>(context, y_size);
+  a_buffer.Write(queue, a_size, a);
+  x_buffer.Write(queue, x_size, x);
+  y_buffer.Write(queue, y_size, y);
+  auto queue_cl = queue();
+  const auto status = clblast::Symv(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      n,
+      alpha,
+      a_buffer(), 0, a_ld,
+      x_buffer(), 0, x_inc,
+      beta,
+      y_buffer(), 0, y_inc,
+      &queue_cl);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  // Copy y_buffer back into the caller's y array
+  y_buffer.Read(queue, y_size, y);
+}
+// Netlib CBLAS entry point for DSYMV. Copies a, x and y into temporary device buffers, forwards
+// the call to clblast::Symv and copies y_buffer back into y afterwards.
+// Throws std::runtime_error if CLBlast reports a status other than kSuccess.
+void cblas_dsymv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                 const int n,
+                 const double alpha,
+                 const double* a, const int a_ld,
+                 const double* x, const int x_inc,
+                 const double beta,
+                 double* y, const int y_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  // Element counts are computed in size_t: plain int products could overflow (undefined
+  // behaviour) for large problem sizes.
+  const auto a_size = static_cast<size_t>(n) * static_cast<size_t>(a_ld);
+  const auto x_size = static_cast<size_t>(n) * static_cast<size_t>(x_inc);
+  const auto y_size = static_cast<size_t>(n) * static_cast<size_t>(y_inc);
+  auto a_buffer = clblast::Buffer<double>(context, a_size);
+  auto x_buffer = clblast::Buffer<double>(context, x_size);
+  auto y_buffer = clblast::Buffer<double>(context, y_size);
+  a_buffer.Write(queue, a_size, a);
+  x_buffer.Write(queue, x_size, x);
+  y_buffer.Write(queue, y_size, y);
+  auto queue_cl = queue();
+  const auto status = clblast::Symv(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      n,
+      alpha,
+      a_buffer(), 0, a_ld,
+      x_buffer(), 0, x_inc,
+      beta,
+      y_buffer(), 0, y_inc,
+      &queue_cl);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  // Copy y_buffer back into the caller's y array
+  y_buffer.Read(queue, y_size, y);
+}
+
+// SBMV
+// Netlib CBLAS entry point for SSBMV. Copies a, x and y into temporary device buffers, forwards
+// the call to clblast::Sbmv and copies y_buffer back into y afterwards.
+// Throws std::runtime_error if CLBlast reports a status other than kSuccess.
+void cblas_ssbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                 const int n, const int k,
+                 const float alpha,
+                 const float* a, const int a_ld,
+                 const float* x, const int x_inc,
+                 const float beta,
+                 float* y, const int y_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  // Element counts are computed in size_t: plain int products could overflow (undefined
+  // behaviour) for large problem sizes.
+  const auto a_size = static_cast<size_t>(n) * static_cast<size_t>(a_ld);
+  const auto x_size = static_cast<size_t>(n) * static_cast<size_t>(x_inc);
+  const auto y_size = static_cast<size_t>(n) * static_cast<size_t>(y_inc);
+  auto a_buffer = clblast::Buffer<float>(context, a_size);
+  auto x_buffer = clblast::Buffer<float>(context, x_size);
+  auto y_buffer = clblast::Buffer<float>(context, y_size);
+  a_buffer.Write(queue, a_size, a);
+  x_buffer.Write(queue, x_size, x);
+  y_buffer.Write(queue, y_size, y);
+  auto queue_cl = queue();
+  const auto status = clblast::Sbmv(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      n, k,
+      alpha,
+      a_buffer(), 0, a_ld,
+      x_buffer(), 0, x_inc,
+      beta,
+      y_buffer(), 0, y_inc,
+      &queue_cl);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  // Copy y_buffer back into the caller's y array
+  y_buffer.Read(queue, y_size, y);
+}
+// Netlib CBLAS entry point for DSBMV. Copies a, x and y into temporary device buffers, forwards
+// the call to clblast::Sbmv and copies y_buffer back into y afterwards.
+// Throws std::runtime_error if CLBlast reports a status other than kSuccess.
+void cblas_dsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                 const int n, const int k,
+                 const double alpha,
+                 const double* a, const int a_ld,
+                 const double* x, const int x_inc,
+                 const double beta,
+                 double* y, const int y_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  // Element counts are computed in size_t: plain int products could overflow (undefined
+  // behaviour) for large problem sizes.
+  const auto a_size = static_cast<size_t>(n) * static_cast<size_t>(a_ld);
+  const auto x_size = static_cast<size_t>(n) * static_cast<size_t>(x_inc);
+  const auto y_size = static_cast<size_t>(n) * static_cast<size_t>(y_inc);
+  auto a_buffer = clblast::Buffer<double>(context, a_size);
+  auto x_buffer = clblast::Buffer<double>(context, x_size);
+  auto y_buffer = clblast::Buffer<double>(context, y_size);
+  a_buffer.Write(queue, a_size, a);
+  x_buffer.Write(queue, x_size, x);
+  y_buffer.Write(queue, y_size, y);
+  auto queue_cl = queue();
+  const auto status = clblast::Sbmv(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      n, k,
+      alpha,
+      a_buffer(), 0, a_ld,
+      x_buffer(), 0, x_inc,
+      beta,
+      y_buffer(), 0, y_inc,
+      &queue_cl);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  // Copy y_buffer back into the caller's y array
+  y_buffer.Read(queue, y_size, y);
+}
+
+// SPMV
+// Netlib CBLAS entry point for SSPMV. Copies ap, x and y into temporary device buffers, forwards
+// the call to clblast::Spmv and copies y_buffer back into y afterwards.
+// ap: n*(n+1)/2 elements are read from it.
+// Throws std::runtime_error if CLBlast reports a status other than kSuccess.
+void cblas_sspmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                 const int n,
+                 const float alpha,
+                 const float* ap,
+                 const float* x, const int x_inc,
+                 const float beta,
+                 float* y, const int y_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  // Element counts are computed in size_t: plain int products could overflow (undefined
+  // behaviour) for large problem sizes.
+  const auto n_elems = static_cast<size_t>(n);
+  const auto ap_size = (n_elems * (n_elems + 1)) / 2;
+  const auto x_size = n_elems * static_cast<size_t>(x_inc);
+  const auto y_size = n_elems * static_cast<size_t>(y_inc);
+  auto ap_buffer = clblast::Buffer<float>(context, ap_size);
+  auto x_buffer = clblast::Buffer<float>(context, x_size);
+  auto y_buffer = clblast::Buffer<float>(context, y_size);
+  ap_buffer.Write(queue, ap_size, ap);
+  x_buffer.Write(queue, x_size, x);
+  y_buffer.Write(queue, y_size, y);
+  auto queue_cl = queue();
+  const auto status = clblast::Spmv(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      n,
+      alpha,
+      ap_buffer(), 0,
+      x_buffer(), 0, x_inc,
+      beta,
+      y_buffer(), 0, y_inc,
+      &queue_cl);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  // Copy y_buffer back into the caller's y array
+  y_buffer.Read(queue, y_size, y);
+}
+// Netlib CBLAS entry point for DSPMV. Copies ap, x and y into temporary device buffers, forwards
+// the call to clblast::Spmv and copies y_buffer back into y afterwards.
+// ap: n*(n+1)/2 elements are read from it.
+// Throws std::runtime_error if CLBlast reports a status other than kSuccess.
+void cblas_dspmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                 const int n,
+                 const double alpha,
+                 const double* ap,
+                 const double* x, const int x_inc,
+                 const double beta,
+                 double* y, const int y_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  // Element counts are computed in size_t: plain int products could overflow (undefined
+  // behaviour) for large problem sizes.
+  const auto n_elems = static_cast<size_t>(n);
+  const auto ap_size = (n_elems * (n_elems + 1)) / 2;
+  const auto x_size = n_elems * static_cast<size_t>(x_inc);
+  const auto y_size = n_elems * static_cast<size_t>(y_inc);
+  auto ap_buffer = clblast::Buffer<double>(context, ap_size);
+  auto x_buffer = clblast::Buffer<double>(context, x_size);
+  auto y_buffer = clblast::Buffer<double>(context, y_size);
+  ap_buffer.Write(queue, ap_size, ap);
+  x_buffer.Write(queue, x_size, x);
+  y_buffer.Write(queue, y_size, y);
+  auto queue_cl = queue();
+  const auto status = clblast::Spmv(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      n,
+      alpha,
+      ap_buffer(), 0,
+      x_buffer(), 0, x_inc,
+      beta,
+      y_buffer(), 0, y_inc,
+      &queue_cl);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  // Copy y_buffer back into the caller's y array
+  y_buffer.Read(queue, y_size, y);
+}
+
+// TRMV
+// Netlib CBLAS entry point for STRMV. Copies a and x into temporary device buffers, forwards the
+// call to clblast::Trmv<float> and copies x_buffer back into x afterwards.
+// Throws std::runtime_error if CLBlast reports a status other than kSuccess.
+void cblas_strmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                 const int n,
+                 const float* a, const int a_ld,
+                 float* x, const int x_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  // Element counts are computed in size_t: plain int products could overflow (undefined
+  // behaviour) for large problem sizes.
+  const auto a_size = static_cast<size_t>(n) * static_cast<size_t>(a_ld);
+  const auto x_size = static_cast<size_t>(n) * static_cast<size_t>(x_inc);
+  auto a_buffer = clblast::Buffer<float>(context, a_size);
+  auto x_buffer = clblast::Buffer<float>(context, x_size);
+  a_buffer.Write(queue, a_size, a);
+  x_buffer.Write(queue, x_size, x);
+  auto queue_cl = queue();
+  const auto status = clblast::Trmv<float>(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      static_cast<clblast::Transpose>(a_transpose),
+      static_cast<clblast::Diagonal>(diagonal),
+      n,
+      a_buffer(), 0, a_ld,
+      x_buffer(), 0, x_inc,
+      &queue_cl);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  // Copy x_buffer back into the caller's x array
+  x_buffer.Read(queue, x_size, x);
+}
+// Netlib CBLAS entry point for DTRMV. Copies a and x into temporary device buffers, forwards the
+// call to clblast::Trmv<double> and copies x_buffer back into x afterwards.
+// Throws std::runtime_error if CLBlast reports a status other than kSuccess.
+void cblas_dtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                 const int n,
+                 const double* a, const int a_ld,
+                 double* x, const int x_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  // Element counts are computed in size_t: plain int products could overflow (undefined
+  // behaviour) for large problem sizes.
+  const auto a_size = static_cast<size_t>(n) * static_cast<size_t>(a_ld);
+  const auto x_size = static_cast<size_t>(n) * static_cast<size_t>(x_inc);
+  auto a_buffer = clblast::Buffer<double>(context, a_size);
+  auto x_buffer = clblast::Buffer<double>(context, x_size);
+  a_buffer.Write(queue, a_size, a);
+  x_buffer.Write(queue, x_size, x);
+  auto queue_cl = queue();
+  const auto status = clblast::Trmv<double>(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      static_cast<clblast::Transpose>(a_transpose),
+      static_cast<clblast::Diagonal>(diagonal),
+      n,
+      a_buffer(), 0, a_ld,
+      x_buffer(), 0, x_inc,
+      &queue_cl);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  // Copy x_buffer back into the caller's x array
+  x_buffer.Read(queue, x_size, x);
+}
+// Netlib CBLAS entry point for CTRMV. Copies a and x (reinterpreted as float2 arrays) into
+// temporary device buffers, forwards the call to clblast::Trmv<float2> and copies x_buffer back
+// into x afterwards.
+// Throws std::runtime_error if CLBlast reports a status other than kSuccess.
+void cblas_ctrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                 const int n,
+                 const void* a, const int a_ld,
+                 void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  // Element counts are computed in size_t: plain int products could overflow (undefined
+  // behaviour) for large problem sizes.
+  const auto a_size = static_cast<size_t>(n) * static_cast<size_t>(a_ld);
+  const auto x_size = static_cast<size_t>(n) * static_cast<size_t>(x_inc);
+  auto a_buffer = clblast::Buffer<float2>(context, a_size);
+  auto x_buffer = clblast::Buffer<float2>(context, x_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<float2*>(x));
+  auto queue_cl = queue();
+  const auto status = clblast::Trmv<float2>(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      static_cast<clblast::Transpose>(a_transpose),
+      static_cast<clblast::Diagonal>(diagonal),
+      n,
+      a_buffer(), 0, a_ld,
+      x_buffer(), 0, x_inc,
+      &queue_cl);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  // Copy x_buffer back into the caller's x array
+  x_buffer.Read(queue, x_size, reinterpret_cast<float2*>(x));
+}
+// Netlib CBLAS entry point for ZTRMV. Copies a and x (reinterpreted as double2 arrays) into
+// temporary device buffers, forwards the call to clblast::Trmv<double2> and copies x_buffer back
+// into x afterwards.
+// Throws std::runtime_error if CLBlast reports a status other than kSuccess.
+void cblas_ztrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                 const int n,
+                 const void* a, const int a_ld,
+                 void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  // Element counts are computed in size_t: plain int products could overflow (undefined
+  // behaviour) for large problem sizes.
+  const auto a_size = static_cast<size_t>(n) * static_cast<size_t>(a_ld);
+  const auto x_size = static_cast<size_t>(n) * static_cast<size_t>(x_inc);
+  auto a_buffer = clblast::Buffer<double2>(context, a_size);
+  auto x_buffer = clblast::Buffer<double2>(context, x_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<double2*>(x));
+  auto queue_cl = queue();
+  const auto status = clblast::Trmv<double2>(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      static_cast<clblast::Transpose>(a_transpose),
+      static_cast<clblast::Diagonal>(diagonal),
+      n,
+      a_buffer(), 0, a_ld,
+      x_buffer(), 0, x_inc,
+      &queue_cl);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  // Copy x_buffer back into the caller's x array
+  x_buffer.Read(queue, x_size, reinterpret_cast<double2*>(x));
+}
+
+// TBMV
+// Netlib CBLAS entry point for STBMV. Copies a and x into temporary device buffers, forwards the
+// call to clblast::Tbmv<float> and copies x_buffer back into x afterwards.
+// Throws std::runtime_error if CLBlast reports a status other than kSuccess.
+void cblas_stbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                 const int n, const int k,
+                 const float* a, const int a_ld,
+                 float* x, const int x_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  // Element counts are computed in size_t: plain int products could overflow (undefined
+  // behaviour) for large problem sizes.
+  const auto a_size = static_cast<size_t>(n) * static_cast<size_t>(a_ld);
+  const auto x_size = static_cast<size_t>(n) * static_cast<size_t>(x_inc);
+  auto a_buffer = clblast::Buffer<float>(context, a_size);
+  auto x_buffer = clblast::Buffer<float>(context, x_size);
+  a_buffer.Write(queue, a_size, a);
+  x_buffer.Write(queue, x_size, x);
+  auto queue_cl = queue();
+  const auto status = clblast::Tbmv<float>(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      static_cast<clblast::Transpose>(a_transpose),
+      static_cast<clblast::Diagonal>(diagonal),
+      n, k,
+      a_buffer(), 0, a_ld,
+      x_buffer(), 0, x_inc,
+      &queue_cl);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  // Copy x_buffer back into the caller's x array
+  x_buffer.Read(queue, x_size, x);
+}
+// Netlib CBLAS entry point for DTBMV. Copies a and x into temporary device buffers, forwards the
+// call to clblast::Tbmv<double> and copies x_buffer back into x afterwards.
+// Throws std::runtime_error if CLBlast reports a status other than kSuccess.
+void cblas_dtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                 const int n, const int k,
+                 const double* a, const int a_ld,
+                 double* x, const int x_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  // Element counts are computed in size_t: plain int products could overflow (undefined
+  // behaviour) for large problem sizes.
+  const auto a_size = static_cast<size_t>(n) * static_cast<size_t>(a_ld);
+  const auto x_size = static_cast<size_t>(n) * static_cast<size_t>(x_inc);
+  auto a_buffer = clblast::Buffer<double>(context, a_size);
+  auto x_buffer = clblast::Buffer<double>(context, x_size);
+  a_buffer.Write(queue, a_size, a);
+  x_buffer.Write(queue, x_size, x);
+  auto queue_cl = queue();
+  const auto status = clblast::Tbmv<double>(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      static_cast<clblast::Transpose>(a_transpose),
+      static_cast<clblast::Diagonal>(diagonal),
+      n, k,
+      a_buffer(), 0, a_ld,
+      x_buffer(), 0, x_inc,
+      &queue_cl);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  // Copy x_buffer back into the caller's x array
+  x_buffer.Read(queue, x_size, x);
+}
+// Netlib CBLAS entry point for CTBMV. Copies a and x (reinterpreted as float2 arrays) into
+// temporary device buffers, forwards the call to clblast::Tbmv<float2> and copies x_buffer back
+// into x afterwards.
+// Throws std::runtime_error if CLBlast reports a status other than kSuccess.
+void cblas_ctbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                 const int n, const int k,
+                 const void* a, const int a_ld,
+                 void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  // Element counts are computed in size_t: plain int products could overflow (undefined
+  // behaviour) for large problem sizes.
+  const auto a_size = static_cast<size_t>(n) * static_cast<size_t>(a_ld);
+  const auto x_size = static_cast<size_t>(n) * static_cast<size_t>(x_inc);
+  auto a_buffer = clblast::Buffer<float2>(context, a_size);
+  auto x_buffer = clblast::Buffer<float2>(context, x_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<float2*>(x));
+  auto queue_cl = queue();
+  const auto status = clblast::Tbmv<float2>(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      static_cast<clblast::Transpose>(a_transpose),
+      static_cast<clblast::Diagonal>(diagonal),
+      n, k,
+      a_buffer(), 0, a_ld,
+      x_buffer(), 0, x_inc,
+      &queue_cl);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  // Copy x_buffer back into the caller's x array
+  x_buffer.Read(queue, x_size, reinterpret_cast<float2*>(x));
+}
+// Netlib CBLAS entry point for ZTBMV. Copies a and x (reinterpreted as double2 arrays) into
+// temporary device buffers, forwards the call to clblast::Tbmv<double2> and copies x_buffer back
+// into x afterwards.
+// Throws std::runtime_error if CLBlast reports a status other than kSuccess.
+void cblas_ztbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                 const int n, const int k,
+                 const void* a, const int a_ld,
+                 void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  // Element counts are computed in size_t: plain int products could overflow (undefined
+  // behaviour) for large problem sizes.
+  const auto a_size = static_cast<size_t>(n) * static_cast<size_t>(a_ld);
+  const auto x_size = static_cast<size_t>(n) * static_cast<size_t>(x_inc);
+  auto a_buffer = clblast::Buffer<double2>(context, a_size);
+  auto x_buffer = clblast::Buffer<double2>(context, x_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+  x_buffer.Write(queue, x_size, reinterpret_cast<double2*>(x));
+  auto queue_cl = queue();
+  const auto status = clblast::Tbmv<double2>(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      static_cast<clblast::Transpose>(a_transpose),
+      static_cast<clblast::Diagonal>(diagonal),
+      n, k,
+      a_buffer(), 0, a_ld,
+      x_buffer(), 0, x_inc,
+      &queue_cl);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  // Copy x_buffer back into the caller's x array
+  x_buffer.Read(queue, x_size, reinterpret_cast<double2*>(x));
+}
+
+// TPMV
+// Netlib CBLAS entry point for STPMV. Copies ap and x into temporary device buffers, forwards
+// the call to clblast::Tpmv<float> and copies x_buffer back into x afterwards.
+// ap: n*(n+1)/2 elements are read from it.
+// Throws std::runtime_error if CLBlast reports a status other than kSuccess.
+void cblas_stpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                 const int n,
+                 const float* ap,
+                 float* x, const int x_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  // Element counts are computed in size_t: plain int products could overflow (undefined
+  // behaviour) for large problem sizes.
+  const auto n_elems = static_cast<size_t>(n);
+  const auto ap_size = (n_elems * (n_elems + 1)) / 2;
+  const auto x_size = n_elems * static_cast<size_t>(x_inc);
+  auto ap_buffer = clblast::Buffer<float>(context, ap_size);
+  auto x_buffer = clblast::Buffer<float>(context, x_size);
+  ap_buffer.Write(queue, ap_size, ap);
+  x_buffer.Write(queue, x_size, x);
+  auto queue_cl = queue();
+  const auto status = clblast::Tpmv<float>(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      static_cast<clblast::Transpose>(a_transpose),
+      static_cast<clblast::Diagonal>(diagonal),
+      n,
+      ap_buffer(), 0,
+      x_buffer(), 0, x_inc,
+      &queue_cl);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  // Copy x_buffer back into the caller's x array
+  x_buffer.Read(queue, x_size, x);
+}
+// Netlib CBLAS entry point for DTPMV. Copies ap and x into temporary device buffers, forwards
+// the call to clblast::Tpmv<double> and copies x_buffer back into x afterwards.
+// ap: n*(n+1)/2 elements are read from it.
+// Throws std::runtime_error if CLBlast reports a status other than kSuccess.
+void cblas_dtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                 const int n,
+                 const double* ap,
+                 double* x, const int x_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  // Element counts are computed in size_t: plain int products could overflow (undefined
+  // behaviour) for large problem sizes.
+  const auto n_elems = static_cast<size_t>(n);
+  const auto ap_size = (n_elems * (n_elems + 1)) / 2;
+  const auto x_size = n_elems * static_cast<size_t>(x_inc);
+  auto ap_buffer = clblast::Buffer<double>(context, ap_size);
+  auto x_buffer = clblast::Buffer<double>(context, x_size);
+  ap_buffer.Write(queue, ap_size, ap);
+  x_buffer.Write(queue, x_size, x);
+  auto queue_cl = queue();
+  const auto status = clblast::Tpmv<double>(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      static_cast<clblast::Transpose>(a_transpose),
+      static_cast<clblast::Diagonal>(diagonal),
+      n,
+      ap_buffer(), 0,
+      x_buffer(), 0, x_inc,
+      &queue_cl);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  // Copy x_buffer back into the caller's x array
+  x_buffer.Read(queue, x_size, x);
+}
+// Netlib CBLAS entry point for CTPMV. Copies ap and x (reinterpreted as float2 arrays) into
+// temporary device buffers, forwards the call to clblast::Tpmv<float2> and copies x_buffer back
+// into x afterwards.
+// ap: n*(n+1)/2 elements are read from it.
+// Throws std::runtime_error if CLBlast reports a status other than kSuccess.
+void cblas_ctpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                 const int n,
+                 const void* ap,
+                 void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  // Element counts are computed in size_t: plain int products could overflow (undefined
+  // behaviour) for large problem sizes.
+  const auto n_elems = static_cast<size_t>(n);
+  const auto ap_size = (n_elems * (n_elems + 1)) / 2;
+  const auto x_size = n_elems * static_cast<size_t>(x_inc);
+  auto ap_buffer = clblast::Buffer<float2>(context, ap_size);
+  auto x_buffer = clblast::Buffer<float2>(context, x_size);
+  ap_buffer.Write(queue, ap_size, reinterpret_cast<const float2*>(ap));
+  x_buffer.Write(queue, x_size, reinterpret_cast<float2*>(x));
+  auto queue_cl = queue();
+  const auto status = clblast::Tpmv<float2>(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      static_cast<clblast::Transpose>(a_transpose),
+      static_cast<clblast::Diagonal>(diagonal),
+      n,
+      ap_buffer(), 0,
+      x_buffer(), 0, x_inc,
+      &queue_cl);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  // Copy x_buffer back into the caller's x array
+  x_buffer.Read(queue, x_size, reinterpret_cast<float2*>(x));
+}
+// Netlib CBLAS entry point for ZTPMV. Copies ap and x (reinterpreted as double2 arrays) into
+// temporary device buffers, forwards the call to clblast::Tpmv<double2> and copies x_buffer back
+// into x afterwards.
+// ap: n*(n+1)/2 elements are read from it.
+// Throws std::runtime_error if CLBlast reports a status other than kSuccess.
+void cblas_ztpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                 const int n,
+                 const void* ap,
+                 void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  // Element counts are computed in size_t: plain int products could overflow (undefined
+  // behaviour) for large problem sizes.
+  const auto n_elems = static_cast<size_t>(n);
+  const auto ap_size = (n_elems * (n_elems + 1)) / 2;
+  const auto x_size = n_elems * static_cast<size_t>(x_inc);
+  auto ap_buffer = clblast::Buffer<double2>(context, ap_size);
+  auto x_buffer = clblast::Buffer<double2>(context, x_size);
+  ap_buffer.Write(queue, ap_size, reinterpret_cast<const double2*>(ap));
+  x_buffer.Write(queue, x_size, reinterpret_cast<double2*>(x));
+  auto queue_cl = queue();
+  const auto status = clblast::Tpmv<double2>(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      static_cast<clblast::Transpose>(a_transpose),
+      static_cast<clblast::Diagonal>(diagonal),
+      n,
+      ap_buffer(), 0,
+      x_buffer(), 0, x_inc,
+      &queue_cl);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  // Copy x_buffer back into the caller's x array
+  x_buffer.Read(queue, x_size, reinterpret_cast<double2*>(x));
+}
+
+// TRSV
+// Netlib CBLAS-style wrapper around clblast::Trsv<float>. `a` (n*a_ld elements) and `x` are
+// uploaded into temporary OpenCL buffers on a newly created context and queue for the device from
+// get_device(); after the routine has run, the device copy of `x` is read back over the host `x`.
+// Throws std::runtime_error if CLBlast returns a status other than kSuccess.
+void cblas_strsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n,
+ const float* a, const int a_ld,
+ float* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto a_size = n * a_ld;
+ // A strided vector of n elements spans 1 + (n-1)*x_inc entries; sizing the copies as n*x_inc
+ // would touch up to x_inc-1 entries beyond a minimally sized caller array.
+ const auto x_size = (n > 0) ? 1 + (n - 1) * x_inc : 0;
+ auto a_buffer = clblast::Buffer<float>(context, a_size);
+ auto x_buffer = clblast::Buffer<float>(context, x_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
+ x_buffer.Write(queue, x_size, reinterpret_cast<float*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Trsv<float>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n,
+ a_buffer(), 0, a_ld,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ // Copy the result vector back into the caller's array.
+ x_buffer.Read(queue, x_size, reinterpret_cast<float*>(x));
+}
+// Netlib CBLAS-style wrapper around clblast::Trsv<double>. `a` (n*a_ld elements) and `x` are
+// uploaded into temporary OpenCL buffers on a newly created context and queue for the device from
+// get_device(); after the routine has run, the device copy of `x` is read back over the host `x`.
+// Throws std::runtime_error if CLBlast returns a status other than kSuccess.
+void cblas_dtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n,
+ const double* a, const int a_ld,
+ double* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto a_size = n * a_ld;
+ // A strided vector of n elements spans 1 + (n-1)*x_inc entries; sizing the copies as n*x_inc
+ // would touch up to x_inc-1 entries beyond a minimally sized caller array.
+ const auto x_size = (n > 0) ? 1 + (n - 1) * x_inc : 0;
+ auto a_buffer = clblast::Buffer<double>(context, a_size);
+ auto x_buffer = clblast::Buffer<double>(context, x_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
+ x_buffer.Write(queue, x_size, reinterpret_cast<double*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Trsv<double>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n,
+ a_buffer(), 0, a_ld,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ // Copy the result vector back into the caller's array.
+ x_buffer.Read(queue, x_size, reinterpret_cast<double*>(x));
+}
+// Netlib CBLAS-style wrapper around clblast::Trsv<float2>. `a` (n*a_ld elements) and `x` are
+// reinterpreted as float2 arrays and uploaded into temporary OpenCL buffers on a newly created
+// context and queue for the device from get_device(); after the routine has run, the device copy
+// of `x` is read back over the host `x`.
+// Throws std::runtime_error if CLBlast returns a status other than kSuccess.
+void cblas_ctrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n,
+ const void* a, const int a_ld,
+ void* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto a_size = n * a_ld;
+ // A strided vector of n elements spans 1 + (n-1)*x_inc entries; sizing the copies as n*x_inc
+ // would touch up to x_inc-1 entries beyond a minimally sized caller array.
+ const auto x_size = (n > 0) ? 1 + (n - 1) * x_inc : 0;
+ auto a_buffer = clblast::Buffer<float2>(context, a_size);
+ auto x_buffer = clblast::Buffer<float2>(context, x_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+ x_buffer.Write(queue, x_size, reinterpret_cast<float2*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Trsv<float2>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n,
+ a_buffer(), 0, a_ld,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ // Copy the result vector back into the caller's array.
+ x_buffer.Read(queue, x_size, reinterpret_cast<float2*>(x));
+}
+// Netlib CBLAS-style wrapper around clblast::Trsv<double2>. `a` (n*a_ld elements) and `x` are
+// reinterpreted as double2 arrays and uploaded into temporary OpenCL buffers on a newly created
+// context and queue for the device from get_device(); after the routine has run, the device copy
+// of `x` is read back over the host `x`.
+// Throws std::runtime_error if CLBlast returns a status other than kSuccess.
+void cblas_ztrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n,
+ const void* a, const int a_ld,
+ void* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto a_size = n * a_ld;
+ // A strided vector of n elements spans 1 + (n-1)*x_inc entries; sizing the copies as n*x_inc
+ // would touch up to x_inc-1 entries beyond a minimally sized caller array.
+ const auto x_size = (n > 0) ? 1 + (n - 1) * x_inc : 0;
+ auto a_buffer = clblast::Buffer<double2>(context, a_size);
+ auto x_buffer = clblast::Buffer<double2>(context, x_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+ x_buffer.Write(queue, x_size, reinterpret_cast<double2*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Trsv<double2>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n,
+ a_buffer(), 0, a_ld,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ // Copy the result vector back into the caller's array.
+ x_buffer.Read(queue, x_size, reinterpret_cast<double2*>(x));
+}
+
+// TBSV
+// Netlib CBLAS-style wrapper around clblast::Tbsv<float>. `a` (n*a_ld elements) and `x` are
+// uploaded into temporary OpenCL buffers on a newly created context and queue for the device from
+// get_device(); after the routine has run, the device copy of `x` is read back over the host `x`.
+// Throws std::runtime_error if CLBlast returns a status other than kSuccess.
+void cblas_stbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n, const int k,
+ const float* a, const int a_ld,
+ float* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto a_size = n * a_ld;
+ // A strided vector of n elements spans 1 + (n-1)*x_inc entries; sizing the copies as n*x_inc
+ // would touch up to x_inc-1 entries beyond a minimally sized caller array.
+ const auto x_size = (n > 0) ? 1 + (n - 1) * x_inc : 0;
+ auto a_buffer = clblast::Buffer<float>(context, a_size);
+ auto x_buffer = clblast::Buffer<float>(context, x_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
+ x_buffer.Write(queue, x_size, reinterpret_cast<float*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Tbsv<float>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n, k,
+ a_buffer(), 0, a_ld,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ // Copy the result vector back into the caller's array.
+ x_buffer.Read(queue, x_size, reinterpret_cast<float*>(x));
+}
+// Netlib CBLAS-style wrapper around clblast::Tbsv<double>. `a` (n*a_ld elements) and `x` are
+// uploaded into temporary OpenCL buffers on a newly created context and queue for the device from
+// get_device(); after the routine has run, the device copy of `x` is read back over the host `x`.
+// Throws std::runtime_error if CLBlast returns a status other than kSuccess.
+void cblas_dtbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n, const int k,
+ const double* a, const int a_ld,
+ double* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto a_size = n * a_ld;
+ // A strided vector of n elements spans 1 + (n-1)*x_inc entries; sizing the copies as n*x_inc
+ // would touch up to x_inc-1 entries beyond a minimally sized caller array.
+ const auto x_size = (n > 0) ? 1 + (n - 1) * x_inc : 0;
+ auto a_buffer = clblast::Buffer<double>(context, a_size);
+ auto x_buffer = clblast::Buffer<double>(context, x_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
+ x_buffer.Write(queue, x_size, reinterpret_cast<double*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Tbsv<double>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n, k,
+ a_buffer(), 0, a_ld,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ // Copy the result vector back into the caller's array.
+ x_buffer.Read(queue, x_size, reinterpret_cast<double*>(x));
+}
+// Netlib CBLAS-style wrapper around clblast::Tbsv<float2>. `a` (n*a_ld elements) and `x` are
+// reinterpreted as float2 arrays and uploaded into temporary OpenCL buffers on a newly created
+// context and queue for the device from get_device(); after the routine has run, the device copy
+// of `x` is read back over the host `x`.
+// Throws std::runtime_error if CLBlast returns a status other than kSuccess.
+void cblas_ctbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n, const int k,
+ const void* a, const int a_ld,
+ void* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto a_size = n * a_ld;
+ // A strided vector of n elements spans 1 + (n-1)*x_inc entries; sizing the copies as n*x_inc
+ // would touch up to x_inc-1 entries beyond a minimally sized caller array.
+ const auto x_size = (n > 0) ? 1 + (n - 1) * x_inc : 0;
+ auto a_buffer = clblast::Buffer<float2>(context, a_size);
+ auto x_buffer = clblast::Buffer<float2>(context, x_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+ x_buffer.Write(queue, x_size, reinterpret_cast<float2*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Tbsv<float2>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n, k,
+ a_buffer(), 0, a_ld,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ // Copy the result vector back into the caller's array.
+ x_buffer.Read(queue, x_size, reinterpret_cast<float2*>(x));
+}
+// Netlib CBLAS-style wrapper around clblast::Tbsv<double2>. `a` (n*a_ld elements) and `x` are
+// reinterpreted as double2 arrays and uploaded into temporary OpenCL buffers on a newly created
+// context and queue for the device from get_device(); after the routine has run, the device copy
+// of `x` is read back over the host `x`.
+// Throws std::runtime_error if CLBlast returns a status other than kSuccess.
+void cblas_ztbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n, const int k,
+ const void* a, const int a_ld,
+ void* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto a_size = n * a_ld;
+ // A strided vector of n elements spans 1 + (n-1)*x_inc entries; sizing the copies as n*x_inc
+ // would touch up to x_inc-1 entries beyond a minimally sized caller array.
+ const auto x_size = (n > 0) ? 1 + (n - 1) * x_inc : 0;
+ auto a_buffer = clblast::Buffer<double2>(context, a_size);
+ auto x_buffer = clblast::Buffer<double2>(context, x_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+ x_buffer.Write(queue, x_size, reinterpret_cast<double2*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Tbsv<double2>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n, k,
+ a_buffer(), 0, a_ld,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ // Copy the result vector back into the caller's array.
+ x_buffer.Read(queue, x_size, reinterpret_cast<double2*>(x));
+}
+
+// TPSV
+// Netlib CBLAS-style wrapper around clblast::Tpsv<float>. `ap` (n*(n+1)/2 elements) and `x` are
+// uploaded into temporary OpenCL buffers on a newly created context and queue for the device from
+// get_device(); after the routine has run, the device copy of `x` is read back over the host `x`.
+// Throws std::runtime_error if CLBlast returns a status other than kSuccess.
+void cblas_stpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n,
+ const float* ap,
+ float* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto ap_size = ((n*(n+1)) / 2);
+ // A strided vector of n elements spans 1 + (n-1)*x_inc entries; sizing the copies as n*x_inc
+ // would touch up to x_inc-1 entries beyond a minimally sized caller array.
+ const auto x_size = (n > 0) ? 1 + (n - 1) * x_inc : 0;
+ auto ap_buffer = clblast::Buffer<float>(context, ap_size);
+ auto x_buffer = clblast::Buffer<float>(context, x_size);
+ ap_buffer.Write(queue, ap_size, reinterpret_cast<const float*>(ap));
+ x_buffer.Write(queue, x_size, reinterpret_cast<float*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Tpsv<float>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n,
+ ap_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ // Copy the result vector back into the caller's array.
+ x_buffer.Read(queue, x_size, reinterpret_cast<float*>(x));
+}
+// Netlib CBLAS-style wrapper around clblast::Tpsv<double>. `ap` (n*(n+1)/2 elements) and `x` are
+// uploaded into temporary OpenCL buffers on a newly created context and queue for the device from
+// get_device(); after the routine has run, the device copy of `x` is read back over the host `x`.
+// Throws std::runtime_error if CLBlast returns a status other than kSuccess.
+void cblas_dtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n,
+ const double* ap,
+ double* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto ap_size = ((n*(n+1)) / 2);
+ // A strided vector of n elements spans 1 + (n-1)*x_inc entries; sizing the copies as n*x_inc
+ // would touch up to x_inc-1 entries beyond a minimally sized caller array.
+ const auto x_size = (n > 0) ? 1 + (n - 1) * x_inc : 0;
+ auto ap_buffer = clblast::Buffer<double>(context, ap_size);
+ auto x_buffer = clblast::Buffer<double>(context, x_size);
+ ap_buffer.Write(queue, ap_size, reinterpret_cast<const double*>(ap));
+ x_buffer.Write(queue, x_size, reinterpret_cast<double*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Tpsv<double>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n,
+ ap_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ // Copy the result vector back into the caller's array.
+ x_buffer.Read(queue, x_size, reinterpret_cast<double*>(x));
+}
+// Netlib CBLAS-style wrapper around clblast::Tpsv<float2>. `ap` (n*(n+1)/2 elements) and `x` are
+// reinterpreted as float2 arrays and uploaded into temporary OpenCL buffers on a newly created
+// context and queue for the device from get_device(); after the routine has run, the device copy
+// of `x` is read back over the host `x`.
+// Throws std::runtime_error if CLBlast returns a status other than kSuccess.
+void cblas_ctpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n,
+ const void* ap,
+ void* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto ap_size = ((n*(n+1)) / 2);
+ // A strided vector of n elements spans 1 + (n-1)*x_inc entries; sizing the copies as n*x_inc
+ // would touch up to x_inc-1 entries beyond a minimally sized caller array.
+ const auto x_size = (n > 0) ? 1 + (n - 1) * x_inc : 0;
+ auto ap_buffer = clblast::Buffer<float2>(context, ap_size);
+ auto x_buffer = clblast::Buffer<float2>(context, x_size);
+ ap_buffer.Write(queue, ap_size, reinterpret_cast<const float2*>(ap));
+ x_buffer.Write(queue, x_size, reinterpret_cast<float2*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Tpsv<float2>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n,
+ ap_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ // Copy the result vector back into the caller's array.
+ x_buffer.Read(queue, x_size, reinterpret_cast<float2*>(x));
+}
+// Netlib CBLAS-style wrapper around clblast::Tpsv<double2>. `ap` (n*(n+1)/2 elements) and `x` are
+// reinterpreted as double2 arrays and uploaded into temporary OpenCL buffers on a newly created
+// context and queue for the device from get_device(); after the routine has run, the device copy
+// of `x` is read back over the host `x`.
+// Throws std::runtime_error if CLBlast returns a status other than kSuccess.
+void cblas_ztpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n,
+ const void* ap,
+ void* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto ap_size = ((n*(n+1)) / 2);
+ // A strided vector of n elements spans 1 + (n-1)*x_inc entries; sizing the copies as n*x_inc
+ // would touch up to x_inc-1 entries beyond a minimally sized caller array.
+ const auto x_size = (n > 0) ? 1 + (n - 1) * x_inc : 0;
+ auto ap_buffer = clblast::Buffer<double2>(context, ap_size);
+ auto x_buffer = clblast::Buffer<double2>(context, x_size);
+ ap_buffer.Write(queue, ap_size, reinterpret_cast<const double2*>(ap));
+ x_buffer.Write(queue, x_size, reinterpret_cast<double2*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Tpsv<double2>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n,
+ ap_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ // Copy the result vector back into the caller's array.
+ x_buffer.Read(queue, x_size, reinterpret_cast<double2*>(x));
+}
+
+// GER
+// Netlib CBLAS-style wrapper around clblast::Ger with a float alpha. `x` (m strided elements), `y`
+// (n strided elements) and `a` (m*a_ld elements for row-major layout, n*a_ld otherwise) are
+// uploaded into temporary OpenCL buffers on a newly created context and queue for the device from
+// get_device(); after the routine has run, the device copy of `a` is read back over the host `a`.
+// Throws std::runtime_error if CLBlast returns a status other than kSuccess.
+void cblas_sger(const CLBlastLayout layout,
+ const int m, const int n,
+ const float alpha,
+ const float* x, const int x_inc,
+ const float* y, const int y_inc,
+ float* a, const int a_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ // A strided vector of k elements spans 1 + (k-1)*inc entries; sizing the copies as k*inc would
+ // read up to inc-1 entries beyond a minimally sized caller array.
+ const auto x_size = (m > 0) ? 1 + (m - 1) * x_inc : 0;
+ const auto y_size = (n > 0) ? 1 + (n - 1) * y_inc : 0;
+ const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
+ auto x_buffer = clblast::Buffer<float>(context, x_size);
+ auto y_buffer = clblast::Buffer<float>(context, y_size);
+ auto a_buffer = clblast::Buffer<float>(context, a_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const float*>(y));
+ a_buffer.Write(queue, a_size, reinterpret_cast<float*>(a));
+ auto queue_cl = queue();
+ auto s = clblast::Ger(static_cast<clblast::Layout>(layout),
+ m, n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ a_buffer(), 0, a_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ // Copy the updated matrix back into the caller's array.
+ a_buffer.Read(queue, a_size, reinterpret_cast<float*>(a));
+}
+// Netlib CBLAS-style wrapper around clblast::Ger with a double alpha. `x` (m strided elements),
+// `y` (n strided elements) and `a` (m*a_ld elements for row-major layout, n*a_ld otherwise) are
+// uploaded into temporary OpenCL buffers on a newly created context and queue for the device from
+// get_device(); after the routine has run, the device copy of `a` is read back over the host `a`.
+// Throws std::runtime_error if CLBlast returns a status other than kSuccess.
+void cblas_dger(const CLBlastLayout layout,
+ const int m, const int n,
+ const double alpha,
+ const double* x, const int x_inc,
+ const double* y, const int y_inc,
+ double* a, const int a_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ // A strided vector of k elements spans 1 + (k-1)*inc entries; sizing the copies as k*inc would
+ // read up to inc-1 entries beyond a minimally sized caller array.
+ const auto x_size = (m > 0) ? 1 + (m - 1) * x_inc : 0;
+ const auto y_size = (n > 0) ? 1 + (n - 1) * y_inc : 0;
+ const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
+ auto x_buffer = clblast::Buffer<double>(context, x_size);
+ auto y_buffer = clblast::Buffer<double>(context, y_size);
+ auto a_buffer = clblast::Buffer<double>(context, a_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const double*>(y));
+ a_buffer.Write(queue, a_size, reinterpret_cast<double*>(a));
+ auto queue_cl = queue();
+ auto s = clblast::Ger(static_cast<clblast::Layout>(layout),
+ m, n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ a_buffer(), 0, a_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ // Copy the updated matrix back into the caller's array.
+ a_buffer.Read(queue, a_size, reinterpret_cast<double*>(a));
+}
+
+// GERU
+// Netlib CBLAS-style wrapper around clblast::Geru with a float2 alpha built from the two floats
+// behind `alpha`. `x` (m strided elements), `y` (n strided elements) and `a` (m*a_ld elements for
+// row-major layout, n*a_ld otherwise) are reinterpreted as float2 arrays and uploaded into
+// temporary OpenCL buffers on a newly created context and queue for the device from get_device();
+// after the routine has run, the device copy of `a` is read back over the host `a`.
+// Throws std::runtime_error if CLBlast returns a status other than kSuccess.
+void cblas_cgeru(const CLBlastLayout layout,
+ const int m, const int n,
+ const void* alpha,
+ const void* x, const int x_inc,
+ const void* y, const int y_inc,
+ void* a, const int a_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+ // A strided vector of k elements spans 1 + (k-1)*inc entries; sizing the copies as k*inc would
+ // read up to inc-1 entries beyond a minimally sized caller array.
+ const auto x_size = (m > 0) ? 1 + (m - 1) * x_inc : 0;
+ const auto y_size = (n > 0) ? 1 + (n - 1) * y_inc : 0;
+ const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
+ auto x_buffer = clblast::Buffer<float2>(context, x_size);
+ auto y_buffer = clblast::Buffer<float2>(context, y_size);
+ auto a_buffer = clblast::Buffer<float2>(context, a_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const float2*>(y));
+ a_buffer.Write(queue, a_size, reinterpret_cast<float2*>(a));
+ auto queue_cl = queue();
+ auto s = clblast::Geru(static_cast<clblast::Layout>(layout),
+ m, n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ a_buffer(), 0, a_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ // Copy the updated matrix back into the caller's array.
+ a_buffer.Read(queue, a_size, reinterpret_cast<float2*>(a));
+}
+// Netlib CBLAS-style wrapper around clblast::Geru with a double2 alpha built from the two doubles
+// behind `alpha`. `x` (m strided elements), `y` (n strided elements) and `a` (m*a_ld elements for
+// row-major layout, n*a_ld otherwise) are reinterpreted as double2 arrays and uploaded into
+// temporary OpenCL buffers on a newly created context and queue for the device from get_device();
+// after the routine has run, the device copy of `a` is read back over the host `a`.
+// Throws std::runtime_error if CLBlast returns a status other than kSuccess.
+void cblas_zgeru(const CLBlastLayout layout,
+ const int m, const int n,
+ const void* alpha,
+ const void* x, const int x_inc,
+ const void* y, const int y_inc,
+ void* a, const int a_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+ // A strided vector of k elements spans 1 + (k-1)*inc entries; sizing the copies as k*inc would
+ // read up to inc-1 entries beyond a minimally sized caller array.
+ const auto x_size = (m > 0) ? 1 + (m - 1) * x_inc : 0;
+ const auto y_size = (n > 0) ? 1 + (n - 1) * y_inc : 0;
+ const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
+ auto x_buffer = clblast::Buffer<double2>(context, x_size);
+ auto y_buffer = clblast::Buffer<double2>(context, y_size);
+ auto a_buffer = clblast::Buffer<double2>(context, a_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const double2*>(y));
+ a_buffer.Write(queue, a_size, reinterpret_cast<double2*>(a));
+ auto queue_cl = queue();
+ auto s = clblast::Geru(static_cast<clblast::Layout>(layout),
+ m, n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ a_buffer(), 0, a_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ // Copy the updated matrix back into the caller's array.
+ a_buffer.Read(queue, a_size, reinterpret_cast<double2*>(a));
+}
+
+// GERC
+// Netlib CBLAS-style wrapper around clblast::Gerc with a float2 alpha built from the two floats
+// behind `alpha`. `x` (m strided elements), `y` (n strided elements) and `a` (m*a_ld elements for
+// row-major layout, n*a_ld otherwise) are reinterpreted as float2 arrays and uploaded into
+// temporary OpenCL buffers on a newly created context and queue for the device from get_device();
+// after the routine has run, the device copy of `a` is read back over the host `a`.
+// Throws std::runtime_error if CLBlast returns a status other than kSuccess.
+void cblas_cgerc(const CLBlastLayout layout,
+ const int m, const int n,
+ const void* alpha,
+ const void* x, const int x_inc,
+ const void* y, const int y_inc,
+ void* a, const int a_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+ // A strided vector of k elements spans 1 + (k-1)*inc entries; sizing the copies as k*inc would
+ // read up to inc-1 entries beyond a minimally sized caller array.
+ const auto x_size = (m > 0) ? 1 + (m - 1) * x_inc : 0;
+ const auto y_size = (n > 0) ? 1 + (n - 1) * y_inc : 0;
+ const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
+ auto x_buffer = clblast::Buffer<float2>(context, x_size);
+ auto y_buffer = clblast::Buffer<float2>(context, y_size);
+ auto a_buffer = clblast::Buffer<float2>(context, a_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const float2*>(y));
+ a_buffer.Write(queue, a_size, reinterpret_cast<float2*>(a));
+ auto queue_cl = queue();
+ auto s = clblast::Gerc(static_cast<clblast::Layout>(layout),
+ m, n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ a_buffer(), 0, a_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ // Copy the updated matrix back into the caller's array.
+ a_buffer.Read(queue, a_size, reinterpret_cast<float2*>(a));
+}
+// Netlib CBLAS-style wrapper around clblast::Gerc with a double2 alpha built from the two doubles
+// behind `alpha`. `x` (m strided elements), `y` (n strided elements) and `a` (m*a_ld elements for
+// row-major layout, n*a_ld otherwise) are reinterpreted as double2 arrays and uploaded into
+// temporary OpenCL buffers on a newly created context and queue for the device from get_device();
+// after the routine has run, the device copy of `a` is read back over the host `a`.
+// Throws std::runtime_error if CLBlast returns a status other than kSuccess.
+void cblas_zgerc(const CLBlastLayout layout,
+ const int m, const int n,
+ const void* alpha,
+ const void* x, const int x_inc,
+ const void* y, const int y_inc,
+ void* a, const int a_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+ // A strided vector of k elements spans 1 + (k-1)*inc entries; sizing the copies as k*inc would
+ // read up to inc-1 entries beyond a minimally sized caller array.
+ const auto x_size = (m > 0) ? 1 + (m - 1) * x_inc : 0;
+ const auto y_size = (n > 0) ? 1 + (n - 1) * y_inc : 0;
+ const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
+ auto x_buffer = clblast::Buffer<double2>(context, x_size);
+ auto y_buffer = clblast::Buffer<double2>(context, y_size);
+ auto a_buffer = clblast::Buffer<double2>(context, a_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const double2*>(y));
+ a_buffer.Write(queue, a_size, reinterpret_cast<double2*>(a));
+ auto queue_cl = queue();
+ auto s = clblast::Gerc(static_cast<clblast::Layout>(layout),
+ m, n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ a_buffer(), 0, a_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ // Copy the updated matrix back into the caller's array.
+ a_buffer.Read(queue, a_size, reinterpret_cast<double2*>(a));
+}
+
+// HER
+// Netlib CBLAS-style wrapper around clblast::Her with a float alpha. `x` (n strided elements) and
+// `a` (n*a_ld elements) are reinterpreted as float2 arrays and uploaded into temporary OpenCL
+// buffers on a newly created context and queue for the device from get_device(); after the
+// routine has run, the device copy of `a` is read back over the host `a`.
+// Throws std::runtime_error if CLBlast returns a status other than kSuccess.
+void cblas_cher(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const float alpha,
+ const void* x, const int x_inc,
+ void* a, const int a_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ // A strided vector of n elements spans 1 + (n-1)*x_inc entries; sizing the copy as n*x_inc
+ // would read up to x_inc-1 entries beyond a minimally sized caller array.
+ const auto x_size = (n > 0) ? 1 + (n - 1) * x_inc : 0;
+ const auto a_size = n * a_ld;
+ auto x_buffer = clblast::Buffer<float2>(context, x_size);
+ auto a_buffer = clblast::Buffer<float2>(context, a_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+ a_buffer.Write(queue, a_size, reinterpret_cast<float2*>(a));
+ auto queue_cl = queue();
+ auto s = clblast::Her(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ a_buffer(), 0, a_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ // Copy the updated matrix back into the caller's array.
+ a_buffer.Read(queue, a_size, reinterpret_cast<float2*>(a));
+}
+// Netlib CBLAS-style wrapper around clblast::Her with a double alpha. `x` (n strided elements) and
+// `a` (n*a_ld elements) are reinterpreted as double2 arrays and uploaded into temporary OpenCL
+// buffers on a newly created context and queue for the device from get_device(); after the
+// routine has run, the device copy of `a` is read back over the host `a`.
+// Throws std::runtime_error if CLBlast returns a status other than kSuccess.
+void cblas_zher(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const double alpha,
+ const void* x, const int x_inc,
+ void* a, const int a_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ // A strided vector of n elements spans 1 + (n-1)*x_inc entries; sizing the copy as n*x_inc
+ // would read up to x_inc-1 entries beyond a minimally sized caller array.
+ const auto x_size = (n > 0) ? 1 + (n - 1) * x_inc : 0;
+ const auto a_size = n * a_ld;
+ auto x_buffer = clblast::Buffer<double2>(context, x_size);
+ auto a_buffer = clblast::Buffer<double2>(context, a_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+ a_buffer.Write(queue, a_size, reinterpret_cast<double2*>(a));
+ auto queue_cl = queue();
+ auto s = clblast::Her(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ a_buffer(), 0, a_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ // Copy the updated matrix back into the caller's array.
+ a_buffer.Read(queue, a_size, reinterpret_cast<double2*>(a));
+}
+
+// HPR
+// Netlib CBLAS-style wrapper around clblast::Hpr with a float alpha. `x` (n strided elements) and
+// `ap` (n*(n+1)/2 elements) are reinterpreted as float2 arrays and uploaded into temporary OpenCL
+// buffers on a newly created context and queue for the device from get_device(); after the
+// routine has run, the device copy of `ap` is read back over the host `ap`.
+// Throws std::runtime_error if CLBlast returns a status other than kSuccess.
+void cblas_chpr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const float alpha,
+ const void* x, const int x_inc,
+ void* ap) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ // A strided vector of n elements spans 1 + (n-1)*x_inc entries; sizing the copy as n*x_inc
+ // would read up to x_inc-1 entries beyond a minimally sized caller array.
+ const auto x_size = (n > 0) ? 1 + (n - 1) * x_inc : 0;
+ const auto ap_size = ((n*(n+1)) / 2);
+ auto x_buffer = clblast::Buffer<float2>(context, x_size);
+ auto ap_buffer = clblast::Buffer<float2>(context, ap_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+ ap_buffer.Write(queue, ap_size, reinterpret_cast<float2*>(ap));
+ auto queue_cl = queue();
+ auto s = clblast::Hpr(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ ap_buffer(), 0,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ // Copy the updated packed matrix back into the caller's array.
+ ap_buffer.Read(queue, ap_size, reinterpret_cast<float2*>(ap));
+}
+// Netlib CBLAS-style wrapper around clblast::Hpr with a double alpha. `x` (n strided elements) and
+// `ap` (n*(n+1)/2 elements) are reinterpreted as double2 arrays and uploaded into temporary OpenCL
+// buffers on a newly created context and queue for the device from get_device(); after the
+// routine has run, the device copy of `ap` is read back over the host `ap`.
+// Throws std::runtime_error if CLBlast returns a status other than kSuccess.
+void cblas_zhpr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const double alpha,
+ const void* x, const int x_inc,
+ void* ap) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ // A strided vector of n elements spans 1 + (n-1)*x_inc entries; sizing the copy as n*x_inc
+ // would read up to x_inc-1 entries beyond a minimally sized caller array.
+ const auto x_size = (n > 0) ? 1 + (n - 1) * x_inc : 0;
+ const auto ap_size = ((n*(n+1)) / 2);
+ auto x_buffer = clblast::Buffer<double2>(context, x_size);
+ auto ap_buffer = clblast::Buffer<double2>(context, ap_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+ ap_buffer.Write(queue, ap_size, reinterpret_cast<double2*>(ap));
+ auto queue_cl = queue();
+ auto s = clblast::Hpr(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ ap_buffer(), 0,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ // Copy the updated packed matrix back into the caller's array.
+ ap_buffer.Read(queue, ap_size, reinterpret_cast<double2*>(ap));
+}
+
+// HER2
+// Netlib CBLAS-style wrapper around clblast::Her2 with a float2 alpha built from the two floats
+// behind `alpha`. `x` and `y` (n strided elements each) and `a` (n*a_ld elements) are
+// reinterpreted as float2 arrays and uploaded into temporary OpenCL buffers on a newly created
+// context and queue for the device from get_device(); after the routine has run, the device copy
+// of `a` is read back over the host `a`.
+// Throws std::runtime_error if CLBlast returns a status other than kSuccess.
+void cblas_cher2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const void* alpha,
+ const void* x, const int x_inc,
+ const void* y, const int y_inc,
+ void* a, const int a_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+ // A strided vector of n elements spans 1 + (n-1)*inc entries; sizing the copies as n*inc would
+ // read up to inc-1 entries beyond a minimally sized caller array.
+ const auto x_size = (n > 0) ? 1 + (n - 1) * x_inc : 0;
+ const auto y_size = (n > 0) ? 1 + (n - 1) * y_inc : 0;
+ const auto a_size = n * a_ld;
+ auto x_buffer = clblast::Buffer<float2>(context, x_size);
+ auto y_buffer = clblast::Buffer<float2>(context, y_size);
+ auto a_buffer = clblast::Buffer<float2>(context, a_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const float2*>(y));
+ a_buffer.Write(queue, a_size, reinterpret_cast<float2*>(a));
+ auto queue_cl = queue();
+ auto s = clblast::Her2(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ a_buffer(), 0, a_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ // Copy the updated matrix back into the caller's array.
+ a_buffer.Read(queue, a_size, reinterpret_cast<float2*>(a));
+}
+// Netlib CBLAS-style wrapper around clblast::Her2 with a double2 alpha built from the two doubles
+// behind `alpha`. `x` and `y` (n strided elements each) and `a` (n*a_ld elements) are
+// reinterpreted as double2 arrays and uploaded into temporary OpenCL buffers on a newly created
+// context and queue for the device from get_device(); after the routine has run, the device copy
+// of `a` is read back over the host `a`.
+// Throws std::runtime_error if CLBlast returns a status other than kSuccess.
+void cblas_zher2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const void* alpha,
+ const void* x, const int x_inc,
+ const void* y, const int y_inc,
+ void* a, const int a_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+ // A strided vector of n elements spans 1 + (n-1)*inc entries; sizing the copies as n*inc would
+ // read up to inc-1 entries beyond a minimally sized caller array.
+ const auto x_size = (n > 0) ? 1 + (n - 1) * x_inc : 0;
+ const auto y_size = (n > 0) ? 1 + (n - 1) * y_inc : 0;
+ const auto a_size = n * a_ld;
+ auto x_buffer = clblast::Buffer<double2>(context, x_size);
+ auto y_buffer = clblast::Buffer<double2>(context, y_size);
+ auto a_buffer = clblast::Buffer<double2>(context, a_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const double2*>(y));
+ a_buffer.Write(queue, a_size, reinterpret_cast<double2*>(a));
+ auto queue_cl = queue();
+ auto s = clblast::Her2(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ a_buffer(), 0, a_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ // Copy the updated matrix back into the caller's array.
+ a_buffer.Read(queue, a_size, reinterpret_cast<double2*>(a));
+}
+
+// HPR2
+// Netlib CBLAS-style wrapper around clblast::Hpr2 with a float2 alpha built from the two floats
+// behind `alpha`. `x` and `y` (n strided elements each) and `ap` (n*(n+1)/2 elements) are
+// reinterpreted as float2 arrays and uploaded into temporary OpenCL buffers on a newly created
+// context and queue for the device from get_device(); after the routine has run, the device copy
+// of `ap` is read back over the host `ap`.
+// Throws std::runtime_error if CLBlast returns a status other than kSuccess.
+void cblas_chpr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const void* alpha,
+ const void* x, const int x_inc,
+ const void* y, const int y_inc,
+ void* ap) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+ // A strided vector of n elements spans 1 + (n-1)*inc entries; sizing the copies as n*inc would
+ // read up to inc-1 entries beyond a minimally sized caller array.
+ const auto x_size = (n > 0) ? 1 + (n - 1) * x_inc : 0;
+ const auto y_size = (n > 0) ? 1 + (n - 1) * y_inc : 0;
+ const auto ap_size = ((n*(n+1)) / 2);
+ auto x_buffer = clblast::Buffer<float2>(context, x_size);
+ auto y_buffer = clblast::Buffer<float2>(context, y_size);
+ auto ap_buffer = clblast::Buffer<float2>(context, ap_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const float2*>(y));
+ ap_buffer.Write(queue, ap_size, reinterpret_cast<float2*>(ap));
+ auto queue_cl = queue();
+ auto s = clblast::Hpr2(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ ap_buffer(), 0,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ // Copy the updated packed matrix back into the caller's array.
+ ap_buffer.Read(queue, ap_size, reinterpret_cast<float2*>(ap));
+}
+// Netlib CBLAS-style wrapper around clblast::Hpr2 with a double2 alpha built from the two doubles
+// behind `alpha`. `x` and `y` (n strided elements each) and `ap` (n*(n+1)/2 elements) are
+// reinterpreted as double2 arrays and uploaded into temporary OpenCL buffers on a newly created
+// context and queue for the device from get_device(); after the routine has run, the device copy
+// of `ap` is read back over the host `ap`.
+// Throws std::runtime_error if CLBlast returns a status other than kSuccess.
+void cblas_zhpr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const void* alpha,
+ const void* x, const int x_inc,
+ const void* y, const int y_inc,
+ void* ap) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+ // A strided vector of n elements spans 1 + (n-1)*inc entries; sizing the copies as n*inc would
+ // read up to inc-1 entries beyond a minimally sized caller array.
+ const auto x_size = (n > 0) ? 1 + (n - 1) * x_inc : 0;
+ const auto y_size = (n > 0) ? 1 + (n - 1) * y_inc : 0;
+ const auto ap_size = ((n*(n+1)) / 2);
+ auto x_buffer = clblast::Buffer<double2>(context, x_size);
+ auto y_buffer = clblast::Buffer<double2>(context, y_size);
+ auto ap_buffer = clblast::Buffer<double2>(context, ap_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const double2*>(y));
+ ap_buffer.Write(queue, ap_size, reinterpret_cast<double2*>(ap));
+ auto queue_cl = queue();
+ auto s = clblast::Hpr2(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ ap_buffer(), 0,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ // Copy the updated packed matrix back into the caller's array.
+ ap_buffer.Read(queue, ap_size, reinterpret_cast<double2*>(ap));
+}
+
+// SYR
+// SSYR entry point of the Netlib-style API: forwards to clblast::Syr on float data. `x` and `a`
+// are uploaded to device buffers created for this call and `a` is downloaded again afterwards.
+// Throws std::runtime_error when the returned status is not kSuccess.
+void cblas_ssyr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                const int n,
+                const float alpha,
+                const float* x, const int x_inc,
+                float* a, const int a_ld) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+
+  // Element counts used for the allocations and the host<->device copies
+  const int x_count = n * x_inc;
+  const int a_count = n * a_ld;
+
+  auto x_buffer = clblast::Buffer<float>(context, x_count);
+  auto a_buffer = clblast::Buffer<float>(context, a_count);
+  x_buffer.Write(queue, x_count, x);
+  a_buffer.Write(queue, a_count, a);
+
+  // Every operand starts at offset zero of its own buffer
+  auto raw_queue = queue();
+  const auto status = clblast::Syr(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      n, alpha,
+      x_buffer(), 0, x_inc,
+      a_buffer(), 0, a_ld,
+      &raw_queue);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  a_buffer.Read(queue, a_count, a);
+}
+// DSYR entry point of the Netlib-style API: forwards to clblast::Syr on double data. `x` and `a`
+// are uploaded to device buffers created for this call and `a` is downloaded again afterwards.
+// Throws std::runtime_error when the returned status is not kSuccess.
+void cblas_dsyr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                const int n,
+                const double alpha,
+                const double* x, const int x_inc,
+                double* a, const int a_ld) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+
+  // Element counts used for the allocations and the host<->device copies
+  const int x_count = n * x_inc;
+  const int a_count = n * a_ld;
+
+  auto x_buffer = clblast::Buffer<double>(context, x_count);
+  auto a_buffer = clblast::Buffer<double>(context, a_count);
+  x_buffer.Write(queue, x_count, x);
+  a_buffer.Write(queue, a_count, a);
+
+  // Every operand starts at offset zero of its own buffer
+  auto raw_queue = queue();
+  const auto status = clblast::Syr(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      n, alpha,
+      x_buffer(), 0, x_inc,
+      a_buffer(), 0, a_ld,
+      &raw_queue);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  a_buffer.Read(queue, a_count, a);
+}
+
+// SPR
+// SSPR entry point of the Netlib-style API: forwards to clblast::Spr on float data. `x` and the
+// packed array `ap` are uploaded to device buffers created for this call; `ap` is downloaded
+// again afterwards. Throws std::runtime_error when the returned status is not kSuccess.
+void cblas_sspr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                const int n,
+                const float alpha,
+                const float* x, const int x_inc,
+                float* ap) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+
+  // Element counts: n * x_inc for the vector, n*(n+1)/2 for the packed array
+  const int x_count = n * x_inc;
+  const int ap_count = (n * (n + 1)) / 2;
+
+  auto x_buffer = clblast::Buffer<float>(context, x_count);
+  auto ap_buffer = clblast::Buffer<float>(context, ap_count);
+  x_buffer.Write(queue, x_count, x);
+  ap_buffer.Write(queue, ap_count, ap);
+
+  // Every operand starts at offset zero of its own buffer
+  auto raw_queue = queue();
+  const auto status = clblast::Spr(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      n, alpha,
+      x_buffer(), 0, x_inc,
+      ap_buffer(), 0,
+      &raw_queue);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  ap_buffer.Read(queue, ap_count, ap);
+}
+// DSPR entry point of the Netlib-style API: forwards to clblast::Spr on double data. `x` and the
+// packed array `ap` are uploaded to device buffers created for this call; `ap` is downloaded
+// again afterwards. Throws std::runtime_error when the returned status is not kSuccess.
+void cblas_dspr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                const int n,
+                const double alpha,
+                const double* x, const int x_inc,
+                double* ap) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+
+  // Element counts: n * x_inc for the vector, n*(n+1)/2 for the packed array
+  const int x_count = n * x_inc;
+  const int ap_count = (n * (n + 1)) / 2;
+
+  auto x_buffer = clblast::Buffer<double>(context, x_count);
+  auto ap_buffer = clblast::Buffer<double>(context, ap_count);
+  x_buffer.Write(queue, x_count, x);
+  ap_buffer.Write(queue, ap_count, ap);
+
+  // Every operand starts at offset zero of its own buffer
+  auto raw_queue = queue();
+  const auto status = clblast::Spr(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      n, alpha,
+      x_buffer(), 0, x_inc,
+      ap_buffer(), 0,
+      &raw_queue);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  ap_buffer.Read(queue, ap_count, ap);
+}
+
+// SYR2
+// SSYR2 entry point of the Netlib-style API: forwards to clblast::Syr2 on float data. `x`, `y`
+// and `a` are uploaded to device buffers created for this call and `a` is downloaded again
+// afterwards. Throws std::runtime_error when the returned status is not kSuccess.
+void cblas_ssyr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                 const int n,
+                 const float alpha,
+                 const float* x, const int x_inc,
+                 const float* y, const int y_inc,
+                 float* a, const int a_ld) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+
+  // Element counts used for the allocations and the host<->device copies
+  const int x_count = n * x_inc;
+  const int y_count = n * y_inc;
+  const int a_count = n * a_ld;
+
+  auto x_buffer = clblast::Buffer<float>(context, x_count);
+  auto y_buffer = clblast::Buffer<float>(context, y_count);
+  auto a_buffer = clblast::Buffer<float>(context, a_count);
+  x_buffer.Write(queue, x_count, x);
+  y_buffer.Write(queue, y_count, y);
+  a_buffer.Write(queue, a_count, a);
+
+  // Every operand starts at offset zero of its own buffer
+  auto raw_queue = queue();
+  const auto status = clblast::Syr2(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      n, alpha,
+      x_buffer(), 0, x_inc,
+      y_buffer(), 0, y_inc,
+      a_buffer(), 0, a_ld,
+      &raw_queue);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  a_buffer.Read(queue, a_count, a);
+}
+// DSYR2 entry point of the Netlib-style API: forwards to clblast::Syr2 on double data. `x`, `y`
+// and `a` are uploaded to device buffers created for this call and `a` is downloaded again
+// afterwards. Throws std::runtime_error when the returned status is not kSuccess.
+void cblas_dsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                 const int n,
+                 const double alpha,
+                 const double* x, const int x_inc,
+                 const double* y, const int y_inc,
+                 double* a, const int a_ld) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+
+  // Element counts used for the allocations and the host<->device copies
+  const int x_count = n * x_inc;
+  const int y_count = n * y_inc;
+  const int a_count = n * a_ld;
+
+  auto x_buffer = clblast::Buffer<double>(context, x_count);
+  auto y_buffer = clblast::Buffer<double>(context, y_count);
+  auto a_buffer = clblast::Buffer<double>(context, a_count);
+  x_buffer.Write(queue, x_count, x);
+  y_buffer.Write(queue, y_count, y);
+  a_buffer.Write(queue, a_count, a);
+
+  // Every operand starts at offset zero of its own buffer
+  auto raw_queue = queue();
+  const auto status = clblast::Syr2(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      n, alpha,
+      x_buffer(), 0, x_inc,
+      y_buffer(), 0, y_inc,
+      a_buffer(), 0, a_ld,
+      &raw_queue);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  a_buffer.Read(queue, a_count, a);
+}
+
+// SPR2
+// SSPR2 entry point of the Netlib-style API: forwards to clblast::Spr2 on float data. `x`, `y`
+// and the packed array `ap` are uploaded to device buffers created for this call; `ap` is
+// downloaded again afterwards. Throws std::runtime_error when the status is not kSuccess.
+void cblas_sspr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                 const int n,
+                 const float alpha,
+                 const float* x, const int x_inc,
+                 const float* y, const int y_inc,
+                 float* ap) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+
+  // Element counts: n * inc for each vector, n*(n+1)/2 for the packed array
+  const int x_count = n * x_inc;
+  const int y_count = n * y_inc;
+  const int ap_count = (n * (n + 1)) / 2;
+
+  auto x_buffer = clblast::Buffer<float>(context, x_count);
+  auto y_buffer = clblast::Buffer<float>(context, y_count);
+  auto ap_buffer = clblast::Buffer<float>(context, ap_count);
+  x_buffer.Write(queue, x_count, x);
+  y_buffer.Write(queue, y_count, y);
+  ap_buffer.Write(queue, ap_count, ap);
+
+  // Every operand starts at offset zero of its own buffer
+  auto raw_queue = queue();
+  const auto status = clblast::Spr2(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      n, alpha,
+      x_buffer(), 0, x_inc,
+      y_buffer(), 0, y_inc,
+      ap_buffer(), 0,
+      &raw_queue);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  ap_buffer.Read(queue, ap_count, ap);
+}
+// DSPR2 entry point of the Netlib-style API: forwards to clblast::Spr2 on double data. `x`, `y`
+// and the packed array `ap` are uploaded to device buffers created for this call; `ap` is
+// downloaded again afterwards. Throws std::runtime_error when the status is not kSuccess.
+void cblas_dspr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                 const int n,
+                 const double alpha,
+                 const double* x, const int x_inc,
+                 const double* y, const int y_inc,
+                 double* ap) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+
+  // Element counts: n * inc for each vector, n*(n+1)/2 for the packed array
+  const int x_count = n * x_inc;
+  const int y_count = n * y_inc;
+  const int ap_count = (n * (n + 1)) / 2;
+
+  auto x_buffer = clblast::Buffer<double>(context, x_count);
+  auto y_buffer = clblast::Buffer<double>(context, y_count);
+  auto ap_buffer = clblast::Buffer<double>(context, ap_count);
+  x_buffer.Write(queue, x_count, x);
+  y_buffer.Write(queue, y_count, y);
+  ap_buffer.Write(queue, ap_count, ap);
+
+  // Every operand starts at offset zero of its own buffer
+  auto raw_queue = queue();
+  const auto status = clblast::Spr2(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      n, alpha,
+      x_buffer(), 0, x_inc,
+      y_buffer(), 0, y_inc,
+      ap_buffer(), 0,
+      &raw_queue);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  ap_buffer.Read(queue, ap_count, ap);
+}
+
+// =================================================================================================
+// BLAS level-3 (matrix-matrix) routines
+// =================================================================================================
+
+// GEMM
+// SGEMM entry point of the Netlib-style API: forwards to clblast::Gemm on float data. `a`, `b`
+// and `c` are uploaded to device buffers created for this call and `c` is downloaded again
+// afterwards. Throws std::runtime_error when the returned status is not kSuccess.
+void cblas_sgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+                 const int m, const int n, const int k,
+                 const float alpha,
+                 const float* a, const int a_ld,
+                 const float* b, const int b_ld,
+                 const float beta,
+                 float* c, const int c_ld) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+
+  // Each buffer holds (leading dimension) x (one of m, n, k); which one depends on the layout
+  // and on whether the operand is transposed
+  const bool a_spans_m = (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo) ||
+                         (layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo);
+  const bool b_spans_k = (layout == CLBlastLayoutRowMajor && b_transpose == CLBlastTransposeNo) ||
+                         (layout == CLBlastLayoutColMajor && b_transpose != CLBlastTransposeNo);
+  const int a_count = (a_spans_m ? m : k) * a_ld;
+  const int b_count = (b_spans_k ? k : n) * b_ld;
+  const int c_count = ((layout == CLBlastLayoutRowMajor) ? m : n) * c_ld;
+
+  auto a_buffer = clblast::Buffer<float>(context, a_count);
+  auto b_buffer = clblast::Buffer<float>(context, b_count);
+  auto c_buffer = clblast::Buffer<float>(context, c_count);
+  a_buffer.Write(queue, a_count, a);
+  b_buffer.Write(queue, b_count, b);
+  c_buffer.Write(queue, c_count, c);
+
+  // Every operand starts at offset zero of its own buffer
+  auto raw_queue = queue();
+  const auto status = clblast::Gemm(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Transpose>(a_transpose),
+      static_cast<clblast::Transpose>(b_transpose),
+      m, n, k,
+      alpha,
+      a_buffer(), 0, a_ld,
+      b_buffer(), 0, b_ld,
+      beta,
+      c_buffer(), 0, c_ld,
+      &raw_queue);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  c_buffer.Read(queue, c_count, c);
+}
+// DGEMM entry point of the Netlib-style API: forwards to clblast::Gemm on double data. `a`, `b`
+// and `c` are uploaded to device buffers created for this call and `c` is downloaded again
+// afterwards. Throws std::runtime_error when the returned status is not kSuccess.
+void cblas_dgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+                 const int m, const int n, const int k,
+                 const double alpha,
+                 const double* a, const int a_ld,
+                 const double* b, const int b_ld,
+                 const double beta,
+                 double* c, const int c_ld) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+
+  // Each buffer holds (leading dimension) x (one of m, n, k); which one depends on the layout
+  // and on whether the operand is transposed
+  const bool a_spans_m = (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo) ||
+                         (layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo);
+  const bool b_spans_k = (layout == CLBlastLayoutRowMajor && b_transpose == CLBlastTransposeNo) ||
+                         (layout == CLBlastLayoutColMajor && b_transpose != CLBlastTransposeNo);
+  const int a_count = (a_spans_m ? m : k) * a_ld;
+  const int b_count = (b_spans_k ? k : n) * b_ld;
+  const int c_count = ((layout == CLBlastLayoutRowMajor) ? m : n) * c_ld;
+
+  auto a_buffer = clblast::Buffer<double>(context, a_count);
+  auto b_buffer = clblast::Buffer<double>(context, b_count);
+  auto c_buffer = clblast::Buffer<double>(context, c_count);
+  a_buffer.Write(queue, a_count, a);
+  b_buffer.Write(queue, b_count, b);
+  c_buffer.Write(queue, c_count, c);
+
+  // Every operand starts at offset zero of its own buffer
+  auto raw_queue = queue();
+  const auto status = clblast::Gemm(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Transpose>(a_transpose),
+      static_cast<clblast::Transpose>(b_transpose),
+      m, n, k,
+      alpha,
+      a_buffer(), 0, a_ld,
+      b_buffer(), 0, b_ld,
+      beta,
+      c_buffer(), 0, c_ld,
+      &raw_queue);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  c_buffer.Read(queue, c_count, c);
+}
+// CGEMM entry point of the Netlib-style API: forwards to clblast::Gemm on float2 data. `a`, `b`
+// and `c` are uploaded to device buffers created for this call and `c` is downloaded again
+// afterwards. Throws std::runtime_error when the returned status is not kSuccess.
+void cblas_cgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+                 const int m, const int n, const int k,
+                 const void* alpha,
+                 const void* a, const int a_ld,
+                 const void* b, const int b_ld,
+                 const void* beta,
+                 void* c, const int c_ld) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+
+  // alpha and beta each point at two consecutive floats that initialise a float2 scalar
+  const auto* alpha_parts = reinterpret_cast<const float*>(alpha);
+  const auto* beta_parts = reinterpret_cast<const float*>(beta);
+  const auto alpha_value = float2{alpha_parts[0], alpha_parts[1]};
+  const auto beta_value = float2{beta_parts[0], beta_parts[1]};
+
+  // Each buffer holds (leading dimension) x (one of m, n, k); which one depends on the layout
+  // and on whether the operand is transposed
+  const bool a_spans_m = (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo) ||
+                         (layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo);
+  const bool b_spans_k = (layout == CLBlastLayoutRowMajor && b_transpose == CLBlastTransposeNo) ||
+                         (layout == CLBlastLayoutColMajor && b_transpose != CLBlastTransposeNo);
+  const int a_count = (a_spans_m ? m : k) * a_ld;
+  const int b_count = (b_spans_k ? k : n) * b_ld;
+  const int c_count = ((layout == CLBlastLayoutRowMajor) ? m : n) * c_ld;
+
+  auto a_buffer = clblast::Buffer<float2>(context, a_count);
+  auto b_buffer = clblast::Buffer<float2>(context, b_count);
+  auto c_buffer = clblast::Buffer<float2>(context, c_count);
+  a_buffer.Write(queue, a_count, static_cast<const float2*>(a));
+  b_buffer.Write(queue, b_count, static_cast<const float2*>(b));
+  c_buffer.Write(queue, c_count, static_cast<float2*>(c));
+
+  // Every operand starts at offset zero of its own buffer
+  auto raw_queue = queue();
+  const auto status = clblast::Gemm(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Transpose>(a_transpose),
+      static_cast<clblast::Transpose>(b_transpose),
+      m, n, k,
+      alpha_value,
+      a_buffer(), 0, a_ld,
+      b_buffer(), 0, b_ld,
+      beta_value,
+      c_buffer(), 0, c_ld,
+      &raw_queue);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  c_buffer.Read(queue, c_count, static_cast<float2*>(c));
+}
+// ZGEMM entry point of the Netlib-style API: forwards to clblast::Gemm on double2 data. `a`, `b`
+// and `c` are uploaded to device buffers created for this call and `c` is downloaded again
+// afterwards. Throws std::runtime_error when the returned status is not kSuccess.
+void cblas_zgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+                 const int m, const int n, const int k,
+                 const void* alpha,
+                 const void* a, const int a_ld,
+                 const void* b, const int b_ld,
+                 const void* beta,
+                 void* c, const int c_ld) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+
+  // alpha and beta each point at two consecutive doubles that initialise a double2 scalar
+  const auto* alpha_parts = reinterpret_cast<const double*>(alpha);
+  const auto* beta_parts = reinterpret_cast<const double*>(beta);
+  const auto alpha_value = double2{alpha_parts[0], alpha_parts[1]};
+  const auto beta_value = double2{beta_parts[0], beta_parts[1]};
+
+  // Each buffer holds (leading dimension) x (one of m, n, k); which one depends on the layout
+  // and on whether the operand is transposed
+  const bool a_spans_m = (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo) ||
+                         (layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo);
+  const bool b_spans_k = (layout == CLBlastLayoutRowMajor && b_transpose == CLBlastTransposeNo) ||
+                         (layout == CLBlastLayoutColMajor && b_transpose != CLBlastTransposeNo);
+  const int a_count = (a_spans_m ? m : k) * a_ld;
+  const int b_count = (b_spans_k ? k : n) * b_ld;
+  const int c_count = ((layout == CLBlastLayoutRowMajor) ? m : n) * c_ld;
+
+  auto a_buffer = clblast::Buffer<double2>(context, a_count);
+  auto b_buffer = clblast::Buffer<double2>(context, b_count);
+  auto c_buffer = clblast::Buffer<double2>(context, c_count);
+  a_buffer.Write(queue, a_count, static_cast<const double2*>(a));
+  b_buffer.Write(queue, b_count, static_cast<const double2*>(b));
+  c_buffer.Write(queue, c_count, static_cast<double2*>(c));
+
+  // Every operand starts at offset zero of its own buffer
+  auto raw_queue = queue();
+  const auto status = clblast::Gemm(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Transpose>(a_transpose),
+      static_cast<clblast::Transpose>(b_transpose),
+      m, n, k,
+      alpha_value,
+      a_buffer(), 0, a_ld,
+      b_buffer(), 0, b_ld,
+      beta_value,
+      c_buffer(), 0, c_ld,
+      &raw_queue);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  c_buffer.Read(queue, c_count, static_cast<double2*>(c));
+}
+
+// SYMM
+// SSYMM entry point of the Netlib-style API: forwards to clblast::Symm on float data. `a`, `b`
+// and `c` are uploaded to device buffers created for this call and `c` is downloaded again
+// afterwards. Throws std::runtime_error when the returned status is not kSuccess.
+void cblas_ssymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+                 const int m, const int n,
+                 const float alpha,
+                 const float* a, const int a_ld,
+                 const float* b, const int b_ld,
+                 const float beta,
+                 float* c, const int c_ld) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  const auto alpha_cpp = alpha;
+  const auto beta_cpp = beta;
+
+  // Per the BLAS definition, A is square with dimension m when applied from the left and n
+  // otherwise, independent of the layout, while B and C are both m-by-n. Sizing A from m (or B
+  // from the side) in row-major mode would over-read one host array and under-size the other
+  // device buffer for right-sided, non-square problems.
+  const int a_dim = (side == CLBlastSideLeft) ? m : n;
+  const int bc_outer = (layout == CLBlastLayoutRowMajor) ? m : n;
+  const auto a_size = a_dim * a_ld;
+  const auto b_size = bc_outer * b_ld;
+  const auto c_size = bc_outer * c_ld;
+
+  auto a_buffer = clblast::Buffer<float>(context, a_size);
+  auto b_buffer = clblast::Buffer<float>(context, b_size);
+  auto c_buffer = clblast::Buffer<float>(context, c_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
+  b_buffer.Write(queue, b_size, reinterpret_cast<const float*>(b));
+  c_buffer.Write(queue, c_size, reinterpret_cast<float*>(c));
+
+  // Every operand starts at offset zero of its own buffer
+  auto queue_cl = queue();
+  auto s = clblast::Symm(static_cast<clblast::Layout>(layout),
+                         static_cast<clblast::Side>(side),
+                         static_cast<clblast::Triangle>(triangle),
+                         m, n,
+                         alpha_cpp,
+                         a_buffer(), 0, a_ld,
+                         b_buffer(), 0, b_ld,
+                         beta_cpp,
+                         c_buffer(), 0, c_ld,
+                         &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+  c_buffer.Read(queue, c_size, reinterpret_cast<float*>(c));
+}
+// DSYMM entry point of the Netlib-style API: forwards to clblast::Symm on double data. `a`, `b`
+// and `c` are uploaded to device buffers created for this call and `c` is downloaded again
+// afterwards. Throws std::runtime_error when the returned status is not kSuccess.
+void cblas_dsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+                 const int m, const int n,
+                 const double alpha,
+                 const double* a, const int a_ld,
+                 const double* b, const int b_ld,
+                 const double beta,
+                 double* c, const int c_ld) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  const auto alpha_cpp = alpha;
+  const auto beta_cpp = beta;
+
+  // Per the BLAS definition, A is square with dimension m when applied from the left and n
+  // otherwise, independent of the layout, while B and C are both m-by-n. Sizing A from m (or B
+  // from the side) in row-major mode would over-read one host array and under-size the other
+  // device buffer for right-sided, non-square problems.
+  const int a_dim = (side == CLBlastSideLeft) ? m : n;
+  const int bc_outer = (layout == CLBlastLayoutRowMajor) ? m : n;
+  const auto a_size = a_dim * a_ld;
+  const auto b_size = bc_outer * b_ld;
+  const auto c_size = bc_outer * c_ld;
+
+  auto a_buffer = clblast::Buffer<double>(context, a_size);
+  auto b_buffer = clblast::Buffer<double>(context, b_size);
+  auto c_buffer = clblast::Buffer<double>(context, c_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
+  b_buffer.Write(queue, b_size, reinterpret_cast<const double*>(b));
+  c_buffer.Write(queue, c_size, reinterpret_cast<double*>(c));
+
+  // Every operand starts at offset zero of its own buffer
+  auto queue_cl = queue();
+  auto s = clblast::Symm(static_cast<clblast::Layout>(layout),
+                         static_cast<clblast::Side>(side),
+                         static_cast<clblast::Triangle>(triangle),
+                         m, n,
+                         alpha_cpp,
+                         a_buffer(), 0, a_ld,
+                         b_buffer(), 0, b_ld,
+                         beta_cpp,
+                         c_buffer(), 0, c_ld,
+                         &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+  c_buffer.Read(queue, c_size, reinterpret_cast<double*>(c));
+}
+// CSYMM entry point of the Netlib-style API: forwards to clblast::Symm on float2 data. `a`, `b`
+// and `c` are uploaded to device buffers created for this call and `c` is downloaded again
+// afterwards. Throws std::runtime_error when the returned status is not kSuccess.
+void cblas_csymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+                 const int m, const int n,
+                 const void* alpha,
+                 const void* a, const int a_ld,
+                 const void* b, const int b_ld,
+                 const void* beta,
+                 void* c, const int c_ld) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+
+  // alpha and beta each point at two consecutive floats that initialise a float2 scalar
+  const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+  const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
+
+  // Per the BLAS definition, A is square with dimension m when applied from the left and n
+  // otherwise, independent of the layout, while B and C are both m-by-n. Sizing A from m (or B
+  // from the side) in row-major mode would over-read one host array and under-size the other
+  // device buffer for right-sided, non-square problems.
+  const int a_dim = (side == CLBlastSideLeft) ? m : n;
+  const int bc_outer = (layout == CLBlastLayoutRowMajor) ? m : n;
+  const auto a_size = a_dim * a_ld;
+  const auto b_size = bc_outer * b_ld;
+  const auto c_size = bc_outer * c_ld;
+
+  auto a_buffer = clblast::Buffer<float2>(context, a_size);
+  auto b_buffer = clblast::Buffer<float2>(context, b_size);
+  auto c_buffer = clblast::Buffer<float2>(context, c_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+  b_buffer.Write(queue, b_size, reinterpret_cast<const float2*>(b));
+  c_buffer.Write(queue, c_size, reinterpret_cast<float2*>(c));
+
+  // Every operand starts at offset zero of its own buffer
+  auto queue_cl = queue();
+  auto s = clblast::Symm(static_cast<clblast::Layout>(layout),
+                         static_cast<clblast::Side>(side),
+                         static_cast<clblast::Triangle>(triangle),
+                         m, n,
+                         alpha_cpp,
+                         a_buffer(), 0, a_ld,
+                         b_buffer(), 0, b_ld,
+                         beta_cpp,
+                         c_buffer(), 0, c_ld,
+                         &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+  c_buffer.Read(queue, c_size, reinterpret_cast<float2*>(c));
+}
+// ZSYMM entry point of the Netlib-style API: forwards to clblast::Symm on double2 data. `a`, `b`
+// and `c` are uploaded to device buffers created for this call and `c` is downloaded again
+// afterwards. Throws std::runtime_error when the returned status is not kSuccess.
+void cblas_zsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+                 const int m, const int n,
+                 const void* alpha,
+                 const void* a, const int a_ld,
+                 const void* b, const int b_ld,
+                 const void* beta,
+                 void* c, const int c_ld) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+
+  // alpha and beta each point at two consecutive doubles that initialise a double2 scalar
+  const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+  const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
+
+  // Per the BLAS definition, A is square with dimension m when applied from the left and n
+  // otherwise, independent of the layout, while B and C are both m-by-n. Sizing A from m (or B
+  // from the side) in row-major mode would over-read one host array and under-size the other
+  // device buffer for right-sided, non-square problems.
+  const int a_dim = (side == CLBlastSideLeft) ? m : n;
+  const int bc_outer = (layout == CLBlastLayoutRowMajor) ? m : n;
+  const auto a_size = a_dim * a_ld;
+  const auto b_size = bc_outer * b_ld;
+  const auto c_size = bc_outer * c_ld;
+
+  auto a_buffer = clblast::Buffer<double2>(context, a_size);
+  auto b_buffer = clblast::Buffer<double2>(context, b_size);
+  auto c_buffer = clblast::Buffer<double2>(context, c_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+  b_buffer.Write(queue, b_size, reinterpret_cast<const double2*>(b));
+  c_buffer.Write(queue, c_size, reinterpret_cast<double2*>(c));
+
+  // Every operand starts at offset zero of its own buffer
+  auto queue_cl = queue();
+  auto s = clblast::Symm(static_cast<clblast::Layout>(layout),
+                         static_cast<clblast::Side>(side),
+                         static_cast<clblast::Triangle>(triangle),
+                         m, n,
+                         alpha_cpp,
+                         a_buffer(), 0, a_ld,
+                         b_buffer(), 0, b_ld,
+                         beta_cpp,
+                         c_buffer(), 0, c_ld,
+                         &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+  c_buffer.Read(queue, c_size, reinterpret_cast<double2*>(c));
+}
+
+// HEMM
+// CHEMM entry point of the Netlib-style API: forwards to clblast::Hemm on float2 data. `a`, `b`
+// and `c` are uploaded to device buffers created for this call and `c` is downloaded again
+// afterwards. Throws std::runtime_error when the returned status is not kSuccess.
+void cblas_chemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+                 const int m, const int n,
+                 const void* alpha,
+                 const void* a, const int a_ld,
+                 const void* b, const int b_ld,
+                 const void* beta,
+                 void* c, const int c_ld) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+
+  // alpha and beta each point at two consecutive floats that initialise a float2 scalar
+  const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+  const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
+
+  // Per the BLAS definition, A is square with dimension m when applied from the left and n
+  // otherwise, independent of the layout, while B and C are both m-by-n. Sizing A from m (or B
+  // from the side) in row-major mode would over-read one host array and under-size the other
+  // device buffer for right-sided, non-square problems.
+  const int a_dim = (side == CLBlastSideLeft) ? m : n;
+  const int bc_outer = (layout == CLBlastLayoutRowMajor) ? m : n;
+  const auto a_size = a_dim * a_ld;
+  const auto b_size = bc_outer * b_ld;
+  const auto c_size = bc_outer * c_ld;
+
+  auto a_buffer = clblast::Buffer<float2>(context, a_size);
+  auto b_buffer = clblast::Buffer<float2>(context, b_size);
+  auto c_buffer = clblast::Buffer<float2>(context, c_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+  b_buffer.Write(queue, b_size, reinterpret_cast<const float2*>(b));
+  c_buffer.Write(queue, c_size, reinterpret_cast<float2*>(c));
+
+  // Every operand starts at offset zero of its own buffer
+  auto queue_cl = queue();
+  auto s = clblast::Hemm(static_cast<clblast::Layout>(layout),
+                         static_cast<clblast::Side>(side),
+                         static_cast<clblast::Triangle>(triangle),
+                         m, n,
+                         alpha_cpp,
+                         a_buffer(), 0, a_ld,
+                         b_buffer(), 0, b_ld,
+                         beta_cpp,
+                         c_buffer(), 0, c_ld,
+                         &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+  c_buffer.Read(queue, c_size, reinterpret_cast<float2*>(c));
+}
+// ZHEMM entry point of the Netlib-style API: forwards to clblast::Hemm on double2 data. `a`, `b`
+// and `c` are uploaded to device buffers created for this call and `c` is downloaded again
+// afterwards. Throws std::runtime_error when the returned status is not kSuccess.
+void cblas_zhemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+                 const int m, const int n,
+                 const void* alpha,
+                 const void* a, const int a_ld,
+                 const void* b, const int b_ld,
+                 const void* beta,
+                 void* c, const int c_ld) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+
+  // alpha and beta each point at two consecutive doubles that initialise a double2 scalar
+  const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+  const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
+
+  // Per the BLAS definition, A is square with dimension m when applied from the left and n
+  // otherwise, independent of the layout, while B and C are both m-by-n. Sizing A from m (or B
+  // from the side) in row-major mode would over-read one host array and under-size the other
+  // device buffer for right-sided, non-square problems.
+  const int a_dim = (side == CLBlastSideLeft) ? m : n;
+  const int bc_outer = (layout == CLBlastLayoutRowMajor) ? m : n;
+  const auto a_size = a_dim * a_ld;
+  const auto b_size = bc_outer * b_ld;
+  const auto c_size = bc_outer * c_ld;
+
+  auto a_buffer = clblast::Buffer<double2>(context, a_size);
+  auto b_buffer = clblast::Buffer<double2>(context, b_size);
+  auto c_buffer = clblast::Buffer<double2>(context, c_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+  b_buffer.Write(queue, b_size, reinterpret_cast<const double2*>(b));
+  c_buffer.Write(queue, c_size, reinterpret_cast<double2*>(c));
+
+  // Every operand starts at offset zero of its own buffer
+  auto queue_cl = queue();
+  auto s = clblast::Hemm(static_cast<clblast::Layout>(layout),
+                         static_cast<clblast::Side>(side),
+                         static_cast<clblast::Triangle>(triangle),
+                         m, n,
+                         alpha_cpp,
+                         a_buffer(), 0, a_ld,
+                         b_buffer(), 0, b_ld,
+                         beta_cpp,
+                         c_buffer(), 0, c_ld,
+                         &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+  c_buffer.Read(queue, c_size, reinterpret_cast<double2*>(c));
+}
+
+// SYRK
+// SSYRK entry point of the Netlib-style API: forwards to clblast::Syrk on float data. `a` and
+// `c` are uploaded to device buffers created for this call and `c` is downloaded again
+// afterwards. Throws std::runtime_error when the returned status is not kSuccess.
+void cblas_ssyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+                 const int n, const int k,
+                 const float alpha,
+                 const float* a, const int a_ld,
+                 const float beta,
+                 float* c, const int c_ld) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+
+  // A holds a_ld x (n or k) elements depending on layout and transposition; C holds n * c_ld
+  const bool a_spans_n = (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo) ||
+                         (layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo);
+  const int a_count = (a_spans_n ? n : k) * a_ld;
+  const int c_count = n * c_ld;
+
+  auto a_buffer = clblast::Buffer<float>(context, a_count);
+  auto c_buffer = clblast::Buffer<float>(context, c_count);
+  a_buffer.Write(queue, a_count, a);
+  c_buffer.Write(queue, c_count, c);
+
+  // Every operand starts at offset zero of its own buffer
+  auto raw_queue = queue();
+  const auto status = clblast::Syrk(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      static_cast<clblast::Transpose>(a_transpose),
+      n, k,
+      alpha,
+      a_buffer(), 0, a_ld,
+      beta,
+      c_buffer(), 0, c_ld,
+      &raw_queue);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  c_buffer.Read(queue, c_count, c);
+}
+// DSYRK entry point of the Netlib-style API: forwards to clblast::Syrk on double data. `a` and
+// `c` are uploaded to device buffers created for this call and `c` is downloaded again
+// afterwards. Throws std::runtime_error when the returned status is not kSuccess.
+void cblas_dsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+                 const int n, const int k,
+                 const double alpha,
+                 const double* a, const int a_ld,
+                 const double beta,
+                 double* c, const int c_ld) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+
+  // A holds a_ld x (n or k) elements depending on layout and transposition; C holds n * c_ld
+  const bool a_spans_n = (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo) ||
+                         (layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo);
+  const int a_count = (a_spans_n ? n : k) * a_ld;
+  const int c_count = n * c_ld;
+
+  auto a_buffer = clblast::Buffer<double>(context, a_count);
+  auto c_buffer = clblast::Buffer<double>(context, c_count);
+  a_buffer.Write(queue, a_count, a);
+  c_buffer.Write(queue, c_count, c);
+
+  // Every operand starts at offset zero of its own buffer
+  auto raw_queue = queue();
+  const auto status = clblast::Syrk(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      static_cast<clblast::Transpose>(a_transpose),
+      n, k,
+      alpha,
+      a_buffer(), 0, a_ld,
+      beta,
+      c_buffer(), 0, c_ld,
+      &raw_queue);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  c_buffer.Read(queue, c_count, c);
+}
+// CSYRK entry point of the Netlib-style API: forwards to clblast::Syrk on float2 data. `a` and
+// `c` are uploaded to device buffers created for this call and `c` is downloaded again
+// afterwards. Throws std::runtime_error when the returned status is not kSuccess.
+void cblas_csyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+                 const int n, const int k,
+                 const void* alpha,
+                 const void* a, const int a_ld,
+                 const void* beta,
+                 void* c, const int c_ld) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+
+  // alpha and beta each point at two consecutive floats that initialise a float2 scalar
+  const auto* alpha_parts = reinterpret_cast<const float*>(alpha);
+  const auto* beta_parts = reinterpret_cast<const float*>(beta);
+  const auto alpha_value = float2{alpha_parts[0], alpha_parts[1]};
+  const auto beta_value = float2{beta_parts[0], beta_parts[1]};
+
+  // A holds a_ld x (n or k) elements depending on layout and transposition; C holds n * c_ld
+  const bool a_spans_n = (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo) ||
+                         (layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo);
+  const int a_count = (a_spans_n ? n : k) * a_ld;
+  const int c_count = n * c_ld;
+
+  auto a_buffer = clblast::Buffer<float2>(context, a_count);
+  auto c_buffer = clblast::Buffer<float2>(context, c_count);
+  a_buffer.Write(queue, a_count, static_cast<const float2*>(a));
+  c_buffer.Write(queue, c_count, static_cast<float2*>(c));
+
+  // Every operand starts at offset zero of its own buffer
+  auto raw_queue = queue();
+  const auto status = clblast::Syrk(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      static_cast<clblast::Transpose>(a_transpose),
+      n, k,
+      alpha_value,
+      a_buffer(), 0, a_ld,
+      beta_value,
+      c_buffer(), 0, c_ld,
+      &raw_queue);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+  c_buffer.Read(queue, c_count, static_cast<float2*>(c));
+}
+// Double-precision complex SYRK via the Netlib-style interface. The host matrices 'a' and 'c' are
+// copied into newly created device buffers, clblast::Syrk is invoked, and the contents of the
+// 'c' buffer are read back into the caller's array. Throws std::runtime_error when CLBlast
+// reports anything other than success.
+void cblas_zsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+                 const int n, const int k,
+                 const void* alpha,
+                 const void* a, const int a_ld,
+                 const void* beta,
+                 void* c, const int c_ld) {
+  // A context and a queue are constructed on every call
+  auto device = get_device();
+  clblast::Context context(device);
+  clblast::Queue queue(context, device);
+
+  // The complex scalars are passed as pointers to a (real, imaginary) pair
+  const auto* alpha_parts = reinterpret_cast<const double*>(alpha);
+  const auto* beta_parts = reinterpret_cast<const double*>(beta);
+  const auto alpha_cpp = double2{alpha_parts[0], alpha_parts[1]};
+  const auto beta_cpp = double2{beta_parts[0], beta_parts[1]};
+
+  // Number of elements to transfer for each matrix
+  const bool a_spans_n = (layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) ||
+                         (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo);
+  const auto a_size = a_spans_n ? n * a_ld : k * a_ld;
+  const auto c_size = n * c_ld;
+
+  // Creates the device buffers and uploads the host data
+  clblast::Buffer<double2> a_buffer(context, a_size);
+  clblast::Buffer<double2> c_buffer(context, c_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+  c_buffer.Write(queue, c_size, reinterpret_cast<double2*>(c));
+
+  // Runs the routine on the raw queue handle
+  auto raw_queue = queue();
+  const auto status = clblast::Syrk(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      static_cast<clblast::Transpose>(a_transpose),
+      n, k,
+      alpha_cpp,
+      a_buffer(), 0, a_ld,
+      beta_cpp,
+      c_buffer(), 0, c_ld,
+      &raw_queue);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+
+  // Copies the result back to the caller
+  c_buffer.Read(queue, c_size, reinterpret_cast<double2*>(c));
+}
+
+// HERK
+// Single-precision complex HERK via the Netlib-style interface. Both scalars are real-valued while
+// the matrices hold float2 elements. The host matrices are uploaded into newly created device
+// buffers, clblast::Herk is invoked, and the 'c' buffer is read back into the caller's array.
+// Throws std::runtime_error when CLBlast reports anything other than success.
+void cblas_cherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+                 const int n, const int k,
+                 const float alpha,
+                 const void* a, const int a_ld,
+                 const float beta,
+                 void* c, const int c_ld) {
+  // A context and a queue are constructed on every call
+  auto device = get_device();
+  clblast::Context context(device);
+  clblast::Queue queue(context, device);
+
+  // Number of elements to transfer for each matrix
+  const bool a_spans_n = (layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) ||
+                         (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo);
+  const auto a_size = a_spans_n ? n * a_ld : k * a_ld;
+  const auto c_size = n * c_ld;
+
+  // Creates the device buffers and uploads the host data
+  clblast::Buffer<float2> a_buffer(context, a_size);
+  clblast::Buffer<float2> c_buffer(context, c_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+  c_buffer.Write(queue, c_size, reinterpret_cast<float2*>(c));
+
+  // Runs the routine on the raw queue handle; the real scalars are forwarded unchanged
+  auto raw_queue = queue();
+  const auto status = clblast::Herk(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      static_cast<clblast::Transpose>(a_transpose),
+      n, k,
+      alpha,
+      a_buffer(), 0, a_ld,
+      beta,
+      c_buffer(), 0, c_ld,
+      &raw_queue);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+
+  // Copies the result back to the caller
+  c_buffer.Read(queue, c_size, reinterpret_cast<float2*>(c));
+}
+// Double-precision complex HERK via the Netlib-style interface. Both scalars are real-valued while
+// the matrices hold double2 elements. The host matrices are uploaded into newly created device
+// buffers, clblast::Herk is invoked, and the 'c' buffer is read back into the caller's array.
+// Throws std::runtime_error when CLBlast reports anything other than success.
+void cblas_zherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+                 const int n, const int k,
+                 const double alpha,
+                 const void* a, const int a_ld,
+                 const double beta,
+                 void* c, const int c_ld) {
+  // A context and a queue are constructed on every call
+  auto device = get_device();
+  clblast::Context context(device);
+  clblast::Queue queue(context, device);
+
+  // Number of elements to transfer for each matrix
+  const bool a_spans_n = (layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) ||
+                         (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo);
+  const auto a_size = a_spans_n ? n * a_ld : k * a_ld;
+  const auto c_size = n * c_ld;
+
+  // Creates the device buffers and uploads the host data
+  clblast::Buffer<double2> a_buffer(context, a_size);
+  clblast::Buffer<double2> c_buffer(context, c_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+  c_buffer.Write(queue, c_size, reinterpret_cast<double2*>(c));
+
+  // Runs the routine on the raw queue handle; the real scalars are forwarded unchanged
+  auto raw_queue = queue();
+  const auto status = clblast::Herk(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      static_cast<clblast::Transpose>(a_transpose),
+      n, k,
+      alpha,
+      a_buffer(), 0, a_ld,
+      beta,
+      c_buffer(), 0, c_ld,
+      &raw_queue);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+
+  // Copies the result back to the caller
+  c_buffer.Read(queue, c_size, reinterpret_cast<double2*>(c));
+}
+
+// SYR2K
+// Single-precision SYR2K via the Netlib-style interface. The host matrices 'a', 'b' and 'c' are
+// uploaded into newly created device buffers, clblast::Syr2k is invoked, and the 'c' buffer is
+// read back into the caller's array. Throws std::runtime_error when CLBlast reports anything
+// other than success.
+void cblas_ssyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+                  const int n, const int k,
+                  const float alpha,
+                  const float* a, const int a_ld,
+                  const float* b, const int b_ld,
+                  const float beta,
+                  float* c, const int c_ld) {
+  // A context and a queue are constructed on every call
+  auto device = get_device();
+  clblast::Context context(device);
+  clblast::Queue queue(context, device);
+
+  // Number of elements to transfer for each matrix; 'a' and 'b' share the same selection rule
+  const bool ab_spans_n = (layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) ||
+                          (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo);
+  const auto a_size = ab_spans_n ? n * a_ld : k * a_ld;
+  const auto b_size = ab_spans_n ? n * b_ld : k * b_ld;
+  const auto c_size = n * c_ld;
+
+  // Creates the device buffers and uploads the host data
+  clblast::Buffer<float> a_buffer(context, a_size);
+  clblast::Buffer<float> b_buffer(context, b_size);
+  clblast::Buffer<float> c_buffer(context, c_size);
+  a_buffer.Write(queue, a_size, a);
+  b_buffer.Write(queue, b_size, b);
+  c_buffer.Write(queue, c_size, c);
+
+  // Runs the routine on the raw queue handle
+  auto raw_queue = queue();
+  const auto status = clblast::Syr2k(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      static_cast<clblast::Transpose>(ab_transpose),
+      n, k,
+      alpha,
+      a_buffer(), 0, a_ld,
+      b_buffer(), 0, b_ld,
+      beta,
+      c_buffer(), 0, c_ld,
+      &raw_queue);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+
+  // Copies the result back to the caller
+  c_buffer.Read(queue, c_size, c);
+}
+// Double-precision SYR2K via the Netlib-style interface. The host matrices 'a', 'b' and 'c' are
+// uploaded into newly created device buffers, clblast::Syr2k is invoked, and the 'c' buffer is
+// read back into the caller's array. Throws std::runtime_error when CLBlast reports anything
+// other than success.
+void cblas_dsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+                  const int n, const int k,
+                  const double alpha,
+                  const double* a, const int a_ld,
+                  const double* b, const int b_ld,
+                  const double beta,
+                  double* c, const int c_ld) {
+  // A context and a queue are constructed on every call
+  auto device = get_device();
+  clblast::Context context(device);
+  clblast::Queue queue(context, device);
+
+  // Number of elements to transfer for each matrix; 'a' and 'b' share the same selection rule
+  const bool ab_spans_n = (layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) ||
+                          (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo);
+  const auto a_size = ab_spans_n ? n * a_ld : k * a_ld;
+  const auto b_size = ab_spans_n ? n * b_ld : k * b_ld;
+  const auto c_size = n * c_ld;
+
+  // Creates the device buffers and uploads the host data
+  clblast::Buffer<double> a_buffer(context, a_size);
+  clblast::Buffer<double> b_buffer(context, b_size);
+  clblast::Buffer<double> c_buffer(context, c_size);
+  a_buffer.Write(queue, a_size, a);
+  b_buffer.Write(queue, b_size, b);
+  c_buffer.Write(queue, c_size, c);
+
+  // Runs the routine on the raw queue handle
+  auto raw_queue = queue();
+  const auto status = clblast::Syr2k(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      static_cast<clblast::Transpose>(ab_transpose),
+      n, k,
+      alpha,
+      a_buffer(), 0, a_ld,
+      b_buffer(), 0, b_ld,
+      beta,
+      c_buffer(), 0, c_ld,
+      &raw_queue);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+
+  // Copies the result back to the caller
+  c_buffer.Read(queue, c_size, c);
+}
+// Single-precision complex SYR2K via the Netlib-style interface. The host matrices 'a', 'b' and
+// 'c' are uploaded into newly created device buffers, clblast::Syr2k is invoked, and the 'c'
+// buffer is read back into the caller's array. Throws std::runtime_error when CLBlast reports
+// anything other than success.
+void cblas_csyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+                  const int n, const int k,
+                  const void* alpha,
+                  const void* a, const int a_ld,
+                  const void* b, const int b_ld,
+                  const void* beta,
+                  void* c, const int c_ld) {
+  // A context and a queue are constructed on every call
+  auto device = get_device();
+  clblast::Context context(device);
+  clblast::Queue queue(context, device);
+
+  // The complex scalars are passed as pointers to a (real, imaginary) pair
+  const auto* alpha_parts = reinterpret_cast<const float*>(alpha);
+  const auto* beta_parts = reinterpret_cast<const float*>(beta);
+  const auto alpha_cpp = float2{alpha_parts[0], alpha_parts[1]};
+  const auto beta_cpp = float2{beta_parts[0], beta_parts[1]};
+
+  // Number of elements to transfer for each matrix; 'a' and 'b' share the same selection rule
+  const bool ab_spans_n = (layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) ||
+                          (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo);
+  const auto a_size = ab_spans_n ? n * a_ld : k * a_ld;
+  const auto b_size = ab_spans_n ? n * b_ld : k * b_ld;
+  const auto c_size = n * c_ld;
+
+  // Creates the device buffers and uploads the host data
+  clblast::Buffer<float2> a_buffer(context, a_size);
+  clblast::Buffer<float2> b_buffer(context, b_size);
+  clblast::Buffer<float2> c_buffer(context, c_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+  b_buffer.Write(queue, b_size, reinterpret_cast<const float2*>(b));
+  c_buffer.Write(queue, c_size, reinterpret_cast<float2*>(c));
+
+  // Runs the routine on the raw queue handle
+  auto raw_queue = queue();
+  const auto status = clblast::Syr2k(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      static_cast<clblast::Transpose>(ab_transpose),
+      n, k,
+      alpha_cpp,
+      a_buffer(), 0, a_ld,
+      b_buffer(), 0, b_ld,
+      beta_cpp,
+      c_buffer(), 0, c_ld,
+      &raw_queue);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+
+  // Copies the result back to the caller
+  c_buffer.Read(queue, c_size, reinterpret_cast<float2*>(c));
+}
+// Double-precision complex SYR2K via the Netlib-style interface. The host matrices 'a', 'b' and
+// 'c' are uploaded into newly created device buffers, clblast::Syr2k is invoked, and the 'c'
+// buffer is read back into the caller's array. Throws std::runtime_error when CLBlast reports
+// anything other than success.
+void cblas_zsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+                  const int n, const int k,
+                  const void* alpha,
+                  const void* a, const int a_ld,
+                  const void* b, const int b_ld,
+                  const void* beta,
+                  void* c, const int c_ld) {
+  // A context and a queue are constructed on every call
+  auto device = get_device();
+  clblast::Context context(device);
+  clblast::Queue queue(context, device);
+
+  // The complex scalars are passed as pointers to a (real, imaginary) pair
+  const auto* alpha_parts = reinterpret_cast<const double*>(alpha);
+  const auto* beta_parts = reinterpret_cast<const double*>(beta);
+  const auto alpha_cpp = double2{alpha_parts[0], alpha_parts[1]};
+  const auto beta_cpp = double2{beta_parts[0], beta_parts[1]};
+
+  // Number of elements to transfer for each matrix; 'a' and 'b' share the same selection rule
+  const bool ab_spans_n = (layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) ||
+                          (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo);
+  const auto a_size = ab_spans_n ? n * a_ld : k * a_ld;
+  const auto b_size = ab_spans_n ? n * b_ld : k * b_ld;
+  const auto c_size = n * c_ld;
+
+  // Creates the device buffers and uploads the host data
+  clblast::Buffer<double2> a_buffer(context, a_size);
+  clblast::Buffer<double2> b_buffer(context, b_size);
+  clblast::Buffer<double2> c_buffer(context, c_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+  b_buffer.Write(queue, b_size, reinterpret_cast<const double2*>(b));
+  c_buffer.Write(queue, c_size, reinterpret_cast<double2*>(c));
+
+  // Runs the routine on the raw queue handle
+  auto raw_queue = queue();
+  const auto status = clblast::Syr2k(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      static_cast<clblast::Transpose>(ab_transpose),
+      n, k,
+      alpha_cpp,
+      a_buffer(), 0, a_ld,
+      b_buffer(), 0, b_ld,
+      beta_cpp,
+      c_buffer(), 0, c_ld,
+      &raw_queue);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+
+  // Copies the result back to the caller
+  c_buffer.Read(queue, c_size, reinterpret_cast<double2*>(c));
+}
+
+// HER2K
+// Single-precision complex HER2K via the Netlib-style interface. 'alpha' is a complex scalar
+// passed by pointer whereas 'beta' is a plain float. The host matrices 'a', 'b' and 'c' are
+// uploaded into newly created device buffers, clblast::Her2k is invoked, and the 'c' buffer is
+// read back into the caller's array. Throws std::runtime_error when CLBlast reports anything
+// other than success.
+void cblas_cher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+                  const int n, const int k,
+                  const void* alpha,
+                  const void* a, const int a_ld,
+                  const void* b, const int b_ld,
+                  const float beta,
+                  void* c, const int c_ld) {
+  // A context and a queue are constructed on every call
+  auto device = get_device();
+  clblast::Context context(device);
+  clblast::Queue queue(context, device);
+
+  // Unpacks the (real, imaginary) pair behind 'alpha'
+  const auto* alpha_parts = reinterpret_cast<const float*>(alpha);
+  const auto alpha_cpp = float2{alpha_parts[0], alpha_parts[1]};
+
+  // Number of elements to transfer for each matrix; 'a' and 'b' share the same selection rule
+  const bool ab_spans_n = (layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) ||
+                          (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo);
+  const auto a_size = ab_spans_n ? n * a_ld : k * a_ld;
+  const auto b_size = ab_spans_n ? n * b_ld : k * b_ld;
+  const auto c_size = n * c_ld;
+
+  // Creates the device buffers and uploads the host data
+  clblast::Buffer<float2> a_buffer(context, a_size);
+  clblast::Buffer<float2> b_buffer(context, b_size);
+  clblast::Buffer<float2> c_buffer(context, c_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+  b_buffer.Write(queue, b_size, reinterpret_cast<const float2*>(b));
+  c_buffer.Write(queue, c_size, reinterpret_cast<float2*>(c));
+
+  // Runs the routine on the raw queue handle
+  auto raw_queue = queue();
+  const auto status = clblast::Her2k(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      static_cast<clblast::Transpose>(ab_transpose),
+      n, k,
+      alpha_cpp,
+      a_buffer(), 0, a_ld,
+      b_buffer(), 0, b_ld,
+      beta,
+      c_buffer(), 0, c_ld,
+      &raw_queue);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+
+  // Copies the result back to the caller
+  c_buffer.Read(queue, c_size, reinterpret_cast<float2*>(c));
+}
+// Double-precision complex HER2K via the Netlib-style interface. 'alpha' is a complex scalar
+// passed by pointer whereas 'beta' is a plain double. The host matrices 'a', 'b' and 'c' are
+// uploaded into newly created device buffers, clblast::Her2k is invoked, and the 'c' buffer is
+// read back into the caller's array. Throws std::runtime_error when CLBlast reports anything
+// other than success.
+void cblas_zher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+                  const int n, const int k,
+                  const void* alpha,
+                  const void* a, const int a_ld,
+                  const void* b, const int b_ld,
+                  const double beta,
+                  void* c, const int c_ld) {
+  // A context and a queue are constructed on every call
+  auto device = get_device();
+  clblast::Context context(device);
+  clblast::Queue queue(context, device);
+
+  // Unpacks the (real, imaginary) pair behind 'alpha'
+  const auto* alpha_parts = reinterpret_cast<const double*>(alpha);
+  const auto alpha_cpp = double2{alpha_parts[0], alpha_parts[1]};
+
+  // Number of elements to transfer for each matrix; 'a' and 'b' share the same selection rule
+  const bool ab_spans_n = (layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) ||
+                          (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo);
+  const auto a_size = ab_spans_n ? n * a_ld : k * a_ld;
+  const auto b_size = ab_spans_n ? n * b_ld : k * b_ld;
+  const auto c_size = n * c_ld;
+
+  // Creates the device buffers and uploads the host data
+  clblast::Buffer<double2> a_buffer(context, a_size);
+  clblast::Buffer<double2> b_buffer(context, b_size);
+  clblast::Buffer<double2> c_buffer(context, c_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+  b_buffer.Write(queue, b_size, reinterpret_cast<const double2*>(b));
+  c_buffer.Write(queue, c_size, reinterpret_cast<double2*>(c));
+
+  // Runs the routine on the raw queue handle
+  auto raw_queue = queue();
+  const auto status = clblast::Her2k(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      static_cast<clblast::Transpose>(ab_transpose),
+      n, k,
+      alpha_cpp,
+      a_buffer(), 0, a_ld,
+      b_buffer(), 0, b_ld,
+      beta,
+      c_buffer(), 0, c_ld,
+      &raw_queue);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+
+  // Copies the result back to the caller
+  c_buffer.Read(queue, c_size, reinterpret_cast<double2*>(c));
+}
+
+// TRMM
+// Single-precision TRMM via the Netlib-style interface. The host matrices 'a' and 'b' are
+// uploaded into newly created device buffers, clblast::Trmm is invoked, and the 'b' buffer is
+// read back into the caller's array (so 'b' is both input and output). Throws
+// std::runtime_error when CLBlast reports anything other than success.
+void cblas_strmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                 const int m, const int n,
+                 const float alpha,
+                 const float* a, const int a_ld,
+                 float* b, const int b_ld) {
+  // A context and a queue are constructed on every call
+  auto device = get_device();
+  clblast::Context context(device);
+  clblast::Queue queue(context, device);
+
+  // Number of elements to transfer: 'a' depends on the side, 'b' on the layout
+  const bool a_on_left = (side == CLBlastSideLeft);
+  const bool row_major = (layout == CLBlastLayoutRowMajor);
+  const auto a_size = a_on_left ? m * a_ld : n * a_ld;
+  const auto b_size = row_major ? m * b_ld : n * b_ld;
+
+  // Creates the device buffers and uploads the host data
+  clblast::Buffer<float> a_buffer(context, a_size);
+  clblast::Buffer<float> b_buffer(context, b_size);
+  a_buffer.Write(queue, a_size, a);
+  b_buffer.Write(queue, b_size, b);
+
+  // Runs the routine on the raw queue handle
+  auto raw_queue = queue();
+  const auto status = clblast::Trmm(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Side>(side),
+      static_cast<clblast::Triangle>(triangle),
+      static_cast<clblast::Transpose>(a_transpose),
+      static_cast<clblast::Diagonal>(diagonal),
+      m, n,
+      alpha,
+      a_buffer(), 0, a_ld,
+      b_buffer(), 0, b_ld,
+      &raw_queue);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+
+  // Copies the result back to the caller
+  b_buffer.Read(queue, b_size, b);
+}
+// Double-precision TRMM via the Netlib-style interface. The host matrices 'a' and 'b' are
+// uploaded into newly created device buffers, clblast::Trmm is invoked, and the 'b' buffer is
+// read back into the caller's array (so 'b' is both input and output). Throws
+// std::runtime_error when CLBlast reports anything other than success.
+void cblas_dtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                 const int m, const int n,
+                 const double alpha,
+                 const double* a, const int a_ld,
+                 double* b, const int b_ld) {
+  // A context and a queue are constructed on every call
+  auto device = get_device();
+  clblast::Context context(device);
+  clblast::Queue queue(context, device);
+
+  // Number of elements to transfer: 'a' depends on the side, 'b' on the layout
+  const bool a_on_left = (side == CLBlastSideLeft);
+  const bool row_major = (layout == CLBlastLayoutRowMajor);
+  const auto a_size = a_on_left ? m * a_ld : n * a_ld;
+  const auto b_size = row_major ? m * b_ld : n * b_ld;
+
+  // Creates the device buffers and uploads the host data
+  clblast::Buffer<double> a_buffer(context, a_size);
+  clblast::Buffer<double> b_buffer(context, b_size);
+  a_buffer.Write(queue, a_size, a);
+  b_buffer.Write(queue, b_size, b);
+
+  // Runs the routine on the raw queue handle
+  auto raw_queue = queue();
+  const auto status = clblast::Trmm(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Side>(side),
+      static_cast<clblast::Triangle>(triangle),
+      static_cast<clblast::Transpose>(a_transpose),
+      static_cast<clblast::Diagonal>(diagonal),
+      m, n,
+      alpha,
+      a_buffer(), 0, a_ld,
+      b_buffer(), 0, b_ld,
+      &raw_queue);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+
+  // Copies the result back to the caller
+  b_buffer.Read(queue, b_size, b);
+}
+// Single-precision complex TRMM via the Netlib-style interface. The host matrices 'a' and 'b'
+// are uploaded into newly created device buffers, clblast::Trmm is invoked, and the 'b' buffer
+// is read back into the caller's array (so 'b' is both input and output). Throws
+// std::runtime_error when CLBlast reports anything other than success.
+void cblas_ctrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                 const int m, const int n,
+                 const void* alpha,
+                 const void* a, const int a_ld,
+                 void* b, const int b_ld) {
+  // A context and a queue are constructed on every call
+  auto device = get_device();
+  clblast::Context context(device);
+  clblast::Queue queue(context, device);
+
+  // Unpacks the (real, imaginary) pair behind 'alpha'
+  const auto* alpha_parts = reinterpret_cast<const float*>(alpha);
+  const auto alpha_cpp = float2{alpha_parts[0], alpha_parts[1]};
+
+  // Number of elements to transfer: 'a' depends on the side, 'b' on the layout
+  const bool a_on_left = (side == CLBlastSideLeft);
+  const bool row_major = (layout == CLBlastLayoutRowMajor);
+  const auto a_size = a_on_left ? m * a_ld : n * a_ld;
+  const auto b_size = row_major ? m * b_ld : n * b_ld;
+
+  // Creates the device buffers and uploads the host data
+  clblast::Buffer<float2> a_buffer(context, a_size);
+  clblast::Buffer<float2> b_buffer(context, b_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+  b_buffer.Write(queue, b_size, reinterpret_cast<float2*>(b));
+
+  // Runs the routine on the raw queue handle
+  auto raw_queue = queue();
+  const auto status = clblast::Trmm(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Side>(side),
+      static_cast<clblast::Triangle>(triangle),
+      static_cast<clblast::Transpose>(a_transpose),
+      static_cast<clblast::Diagonal>(diagonal),
+      m, n,
+      alpha_cpp,
+      a_buffer(), 0, a_ld,
+      b_buffer(), 0, b_ld,
+      &raw_queue);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+
+  // Copies the result back to the caller
+  b_buffer.Read(queue, b_size, reinterpret_cast<float2*>(b));
+}
+// Double-precision complex TRMM via the Netlib-style interface. The host matrices 'a' and 'b'
+// are uploaded into newly created device buffers, clblast::Trmm is invoked, and the 'b' buffer
+// is read back into the caller's array (so 'b' is both input and output). Throws
+// std::runtime_error when CLBlast reports anything other than success.
+void cblas_ztrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                 const int m, const int n,
+                 const void* alpha,
+                 const void* a, const int a_ld,
+                 void* b, const int b_ld) {
+  // A context and a queue are constructed on every call
+  auto device = get_device();
+  clblast::Context context(device);
+  clblast::Queue queue(context, device);
+
+  // Unpacks the (real, imaginary) pair behind 'alpha'
+  const auto* alpha_parts = reinterpret_cast<const double*>(alpha);
+  const auto alpha_cpp = double2{alpha_parts[0], alpha_parts[1]};
+
+  // Number of elements to transfer: 'a' depends on the side, 'b' on the layout
+  const bool a_on_left = (side == CLBlastSideLeft);
+  const bool row_major = (layout == CLBlastLayoutRowMajor);
+  const auto a_size = a_on_left ? m * a_ld : n * a_ld;
+  const auto b_size = row_major ? m * b_ld : n * b_ld;
+
+  // Creates the device buffers and uploads the host data
+  clblast::Buffer<double2> a_buffer(context, a_size);
+  clblast::Buffer<double2> b_buffer(context, b_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+  b_buffer.Write(queue, b_size, reinterpret_cast<double2*>(b));
+
+  // Runs the routine on the raw queue handle
+  auto raw_queue = queue();
+  const auto status = clblast::Trmm(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Side>(side),
+      static_cast<clblast::Triangle>(triangle),
+      static_cast<clblast::Transpose>(a_transpose),
+      static_cast<clblast::Diagonal>(diagonal),
+      m, n,
+      alpha_cpp,
+      a_buffer(), 0, a_ld,
+      b_buffer(), 0, b_ld,
+      &raw_queue);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+
+  // Copies the result back to the caller
+  b_buffer.Read(queue, b_size, reinterpret_cast<double2*>(b));
+}
+
+// TRSM
+// Single-precision TRSM via the Netlib-style interface. The host matrices 'a' and 'b' are
+// uploaded into newly created device buffers, clblast::Trsm is invoked, and the 'b' buffer is
+// read back into the caller's array (so 'b' is both input and output). Throws
+// std::runtime_error when CLBlast reports anything other than success.
+void cblas_strsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                 const int m, const int n,
+                 const float alpha,
+                 const float* a, const int a_ld,
+                 float* b, const int b_ld) {
+  // A context and a queue are constructed on every call
+  auto device = get_device();
+  clblast::Context context(device);
+  clblast::Queue queue(context, device);
+
+  // Number of elements to transfer: 'a' depends on the side, 'b' on the layout
+  const bool a_on_left = (side == CLBlastSideLeft);
+  const bool row_major = (layout == CLBlastLayoutRowMajor);
+  const auto a_size = a_on_left ? m * a_ld : n * a_ld;
+  const auto b_size = row_major ? m * b_ld : n * b_ld;
+
+  // Creates the device buffers and uploads the host data
+  clblast::Buffer<float> a_buffer(context, a_size);
+  clblast::Buffer<float> b_buffer(context, b_size);
+  a_buffer.Write(queue, a_size, a);
+  b_buffer.Write(queue, b_size, b);
+
+  // Runs the routine on the raw queue handle
+  auto raw_queue = queue();
+  const auto status = clblast::Trsm(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Side>(side),
+      static_cast<clblast::Triangle>(triangle),
+      static_cast<clblast::Transpose>(a_transpose),
+      static_cast<clblast::Diagonal>(diagonal),
+      m, n,
+      alpha,
+      a_buffer(), 0, a_ld,
+      b_buffer(), 0, b_ld,
+      &raw_queue);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+
+  // Copies the result back to the caller
+  b_buffer.Read(queue, b_size, b);
+}
+// Double-precision TRSM via the Netlib-style interface. The host matrices 'a' and 'b' are
+// uploaded into newly created device buffers, clblast::Trsm is invoked, and the 'b' buffer is
+// read back into the caller's array (so 'b' is both input and output). Throws
+// std::runtime_error when CLBlast reports anything other than success.
+void cblas_dtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                 const int m, const int n,
+                 const double alpha,
+                 const double* a, const int a_ld,
+                 double* b, const int b_ld) {
+  // A context and a queue are constructed on every call
+  auto device = get_device();
+  clblast::Context context(device);
+  clblast::Queue queue(context, device);
+
+  // Number of elements to transfer: 'a' depends on the side, 'b' on the layout
+  const bool a_on_left = (side == CLBlastSideLeft);
+  const bool row_major = (layout == CLBlastLayoutRowMajor);
+  const auto a_size = a_on_left ? m * a_ld : n * a_ld;
+  const auto b_size = row_major ? m * b_ld : n * b_ld;
+
+  // Creates the device buffers and uploads the host data
+  clblast::Buffer<double> a_buffer(context, a_size);
+  clblast::Buffer<double> b_buffer(context, b_size);
+  a_buffer.Write(queue, a_size, a);
+  b_buffer.Write(queue, b_size, b);
+
+  // Runs the routine on the raw queue handle
+  auto raw_queue = queue();
+  const auto status = clblast::Trsm(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Side>(side),
+      static_cast<clblast::Triangle>(triangle),
+      static_cast<clblast::Transpose>(a_transpose),
+      static_cast<clblast::Diagonal>(diagonal),
+      m, n,
+      alpha,
+      a_buffer(), 0, a_ld,
+      b_buffer(), 0, b_ld,
+      &raw_queue);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+
+  // Copies the result back to the caller
+  b_buffer.Read(queue, b_size, b);
+}
+// Single-precision complex TRSM via the Netlib-style interface. The host matrices 'a' and 'b'
+// are uploaded into newly created device buffers, clblast::Trsm is invoked, and the 'b' buffer
+// is read back into the caller's array (so 'b' is both input and output). Throws
+// std::runtime_error when CLBlast reports anything other than success.
+void cblas_ctrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                 const int m, const int n,
+                 const void* alpha,
+                 const void* a, const int a_ld,
+                 void* b, const int b_ld) {
+  // A context and a queue are constructed on every call
+  auto device = get_device();
+  clblast::Context context(device);
+  clblast::Queue queue(context, device);
+
+  // Unpacks the (real, imaginary) pair behind 'alpha'
+  const auto* alpha_parts = reinterpret_cast<const float*>(alpha);
+  const auto alpha_cpp = float2{alpha_parts[0], alpha_parts[1]};
+
+  // Number of elements to transfer: 'a' depends on the side, 'b' on the layout
+  const bool a_on_left = (side == CLBlastSideLeft);
+  const bool row_major = (layout == CLBlastLayoutRowMajor);
+  const auto a_size = a_on_left ? m * a_ld : n * a_ld;
+  const auto b_size = row_major ? m * b_ld : n * b_ld;
+
+  // Creates the device buffers and uploads the host data
+  clblast::Buffer<float2> a_buffer(context, a_size);
+  clblast::Buffer<float2> b_buffer(context, b_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+  b_buffer.Write(queue, b_size, reinterpret_cast<float2*>(b));
+
+  // Runs the routine on the raw queue handle
+  auto raw_queue = queue();
+  const auto status = clblast::Trsm(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Side>(side),
+      static_cast<clblast::Triangle>(triangle),
+      static_cast<clblast::Transpose>(a_transpose),
+      static_cast<clblast::Diagonal>(diagonal),
+      m, n,
+      alpha_cpp,
+      a_buffer(), 0, a_ld,
+      b_buffer(), 0, b_ld,
+      &raw_queue);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+
+  // Copies the result back to the caller
+  b_buffer.Read(queue, b_size, reinterpret_cast<float2*>(b));
+}
+// Double-precision complex TRSM via the Netlib-style interface. The host matrices 'a' and 'b'
+// are uploaded into newly created device buffers, clblast::Trsm is invoked, and the 'b' buffer
+// is read back into the caller's array (so 'b' is both input and output). Throws
+// std::runtime_error when CLBlast reports anything other than success.
+void cblas_ztrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                 const int m, const int n,
+                 const void* alpha,
+                 const void* a, const int a_ld,
+                 void* b, const int b_ld) {
+  // A context and a queue are constructed on every call
+  auto device = get_device();
+  clblast::Context context(device);
+  clblast::Queue queue(context, device);
+
+  // Unpacks the (real, imaginary) pair behind 'alpha'
+  const auto* alpha_parts = reinterpret_cast<const double*>(alpha);
+  const auto alpha_cpp = double2{alpha_parts[0], alpha_parts[1]};
+
+  // Number of elements to transfer: 'a' depends on the side, 'b' on the layout
+  const bool a_on_left = (side == CLBlastSideLeft);
+  const bool row_major = (layout == CLBlastLayoutRowMajor);
+  const auto a_size = a_on_left ? m * a_ld : n * a_ld;
+  const auto b_size = row_major ? m * b_ld : n * b_ld;
+
+  // Creates the device buffers and uploads the host data
+  clblast::Buffer<double2> a_buffer(context, a_size);
+  clblast::Buffer<double2> b_buffer(context, b_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+  b_buffer.Write(queue, b_size, reinterpret_cast<double2*>(b));
+
+  // Runs the routine on the raw queue handle
+  auto raw_queue = queue();
+  const auto status = clblast::Trsm(
+      static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Side>(side),
+      static_cast<clblast::Triangle>(triangle),
+      static_cast<clblast::Transpose>(a_transpose),
+      static_cast<clblast::Diagonal>(diagonal),
+      m, n,
+      alpha_cpp,
+      a_buffer(), 0, a_ld,
+      b_buffer(), 0, b_ld,
+      &raw_queue);
+  if (status != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(status));
+  }
+
+  // Copies the result back to the caller
+  b_buffer.Read(queue, b_size, reinterpret_cast<double2*>(b));
+}
+
+// =================================================================================================
+// Extra non-BLAS routines (level-X)
+// =================================================================================================
+
+// OMATCOPY
+// Single-precision OMATCOPY via the Netlib-style interface. The host matrices 'a' and 'b' are
+// uploaded into newly created device buffers, clblast::Omatcopy is invoked, and the 'b' buffer
+// is read back into the caller's array. Throws std::runtime_error when CLBlast reports anything
+// other than success.
+void cblas_somatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+                     const int m, const int n,
+                     const float alpha,
+                     const float* a, const int a_ld,
+                     float* b, const int b_ld) {
+  // A context and a queue are constructed on every call
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  const auto alpha_cpp = alpha;
+
+  // 'a' is m-by-n: m lines of a_ld elements when row-major, n lines otherwise
+  const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
+
+  // 'b' is m-by-n without transposition and n-by-m with it. It therefore consists of m lines of
+  // b_ld elements for (row-major, no transpose) and (col-major, transpose), and of n lines
+  // otherwise. The two branches used to be swapped, which mis-sized the transfers of 'b'
+  // whenever m != n.
+  const auto b_has_m_lines = (layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) ||
+                             (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo);
+  const auto b_size = b_has_m_lines ? m * b_ld : n * b_ld;
+
+  // Creates the device buffers and uploads the host data
+  auto a_buffer = clblast::Buffer<float>(context, a_size);
+  auto b_buffer = clblast::Buffer<float>(context, b_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
+  b_buffer.Write(queue, b_size, reinterpret_cast<float*>(b));
+
+  // Runs the routine on the raw queue handle
+  auto queue_cl = queue();
+  auto s = clblast::Omatcopy(static_cast<clblast::Layout>(layout),
+                             static_cast<clblast::Transpose>(a_transpose),
+                             m, n,
+                             alpha_cpp,
+                             a_buffer(), 0, a_ld,
+                             b_buffer(), 0, b_ld,
+                             &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+
+  // Copies the result back to the caller
+  b_buffer.Read(queue, b_size, reinterpret_cast<float*>(b));
+}
+// Double-precision OMATCOPY via the Netlib-style interface. The host matrices 'a' and 'b' are
+// uploaded into newly created device buffers, clblast::Omatcopy is invoked, and the 'b' buffer
+// is read back into the caller's array. Throws std::runtime_error when CLBlast reports anything
+// other than success.
+void cblas_domatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+                     const int m, const int n,
+                     const double alpha,
+                     const double* a, const int a_ld,
+                     double* b, const int b_ld) {
+  // A context and a queue are constructed on every call
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  const auto alpha_cpp = alpha;
+
+  // 'a' is m-by-n: m lines of a_ld elements when row-major, n lines otherwise
+  const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
+
+  // 'b' is m-by-n without transposition and n-by-m with it. It therefore consists of m lines of
+  // b_ld elements for (row-major, no transpose) and (col-major, transpose), and of n lines
+  // otherwise. The two branches used to be swapped, which mis-sized the transfers of 'b'
+  // whenever m != n.
+  const auto b_has_m_lines = (layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) ||
+                             (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo);
+  const auto b_size = b_has_m_lines ? m * b_ld : n * b_ld;
+
+  // Creates the device buffers and uploads the host data
+  auto a_buffer = clblast::Buffer<double>(context, a_size);
+  auto b_buffer = clblast::Buffer<double>(context, b_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
+  b_buffer.Write(queue, b_size, reinterpret_cast<double*>(b));
+
+  // Runs the routine on the raw queue handle
+  auto queue_cl = queue();
+  auto s = clblast::Omatcopy(static_cast<clblast::Layout>(layout),
+                             static_cast<clblast::Transpose>(a_transpose),
+                             m, n,
+                             alpha_cpp,
+                             a_buffer(), 0, a_ld,
+                             b_buffer(), 0, b_ld,
+                             &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+
+  // Copies the result back to the caller
+  b_buffer.Read(queue, b_size, reinterpret_cast<double*>(b));
+}
+// Single-precision complex OMATCOPY via the Netlib-style interface. The host matrices 'a' and
+// 'b' are uploaded into newly created device buffers, clblast::Omatcopy is invoked, and the 'b'
+// buffer is read back into the caller's array. Throws std::runtime_error when CLBlast reports
+// anything other than success.
+void cblas_comatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+                     const int m, const int n,
+                     const void* alpha,
+                     const void* a, const int a_ld,
+                     void* b, const int b_ld) {
+  // A context and a queue are constructed on every call
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+
+  // The complex scalar is passed as a pointer to a (real, imaginary) pair
+  const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+
+  // 'a' is m-by-n: m lines of a_ld elements when row-major, n lines otherwise
+  const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
+
+  // 'b' is m-by-n without transposition and n-by-m with it. It therefore consists of m lines of
+  // b_ld elements for (row-major, no transpose) and (col-major, transpose), and of n lines
+  // otherwise. The two branches used to be swapped, which mis-sized the transfers of 'b'
+  // whenever m != n.
+  const auto b_has_m_lines = (layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) ||
+                             (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo);
+  const auto b_size = b_has_m_lines ? m * b_ld : n * b_ld;
+
+  // Creates the device buffers and uploads the host data
+  auto a_buffer = clblast::Buffer<float2>(context, a_size);
+  auto b_buffer = clblast::Buffer<float2>(context, b_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+  b_buffer.Write(queue, b_size, reinterpret_cast<float2*>(b));
+
+  // Runs the routine on the raw queue handle
+  auto queue_cl = queue();
+  auto s = clblast::Omatcopy(static_cast<clblast::Layout>(layout),
+                             static_cast<clblast::Transpose>(a_transpose),
+                             m, n,
+                             alpha_cpp,
+                             a_buffer(), 0, a_ld,
+                             b_buffer(), 0, b_ld,
+                             &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+
+  // Copies the result back to the caller
+  b_buffer.Read(queue, b_size, reinterpret_cast<float2*>(b));
+}
+// Double-precision complex OMATCOPY via the Netlib-style interface. The host matrices 'a' and
+// 'b' are uploaded into newly created device buffers, clblast::Omatcopy is invoked, and the 'b'
+// buffer is read back into the caller's array. Throws std::runtime_error when CLBlast reports
+// anything other than success.
+void cblas_zomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+                     const int m, const int n,
+                     const void* alpha,
+                     const void* a, const int a_ld,
+                     void* b, const int b_ld) {
+  // A context and a queue are constructed on every call
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+
+  // The complex scalar is passed as a pointer to a (real, imaginary) pair
+  const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+
+  // 'a' is m-by-n: m lines of a_ld elements when row-major, n lines otherwise
+  const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
+
+  // 'b' is m-by-n without transposition and n-by-m with it. It therefore consists of m lines of
+  // b_ld elements for (row-major, no transpose) and (col-major, transpose), and of n lines
+  // otherwise. The two branches used to be swapped, which mis-sized the transfers of 'b'
+  // whenever m != n.
+  const auto b_has_m_lines = (layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) ||
+                             (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo);
+  const auto b_size = b_has_m_lines ? m * b_ld : n * b_ld;
+
+  // Creates the device buffers and uploads the host data
+  auto a_buffer = clblast::Buffer<double2>(context, a_size);
+  auto b_buffer = clblast::Buffer<double2>(context, b_size);
+  a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+  b_buffer.Write(queue, b_size, reinterpret_cast<double2*>(b));
+
+  // Runs the routine on the raw queue handle
+  auto queue_cl = queue();
+  auto s = clblast::Omatcopy(static_cast<clblast::Layout>(layout),
+                             static_cast<clblast::Transpose>(a_transpose),
+                             m, n,
+                             alpha_cpp,
+                             a_buffer(), 0, a_ld,
+                             b_buffer(), 0, b_ld,
+                             &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+
+  // Copies the result back to the caller
+  b_buffer.Read(queue, b_size, reinterpret_cast<double2*>(b));
+}
+
+// =================================================================================================
diff --git a/src/routines/level1/xscal.cpp b/src/routines/level1/xscal.cpp
index 17410f01..0521b1e5 100644
--- a/src/routines/level1/xscal.cpp
+++ b/src/routines/level1/xscal.cpp
@@ -55,12 +55,12 @@ void Xscal<T>::DoScal(const size_t n, const T alpha,
// Sets the kernel arguments
if (use_fast_kernel) {
kernel.SetArgument(0, static_cast<int>(n));
- kernel.SetArgument(1, alpha);
+ kernel.SetArgument(1, GetRealArg(alpha));
kernel.SetArgument(2, x_buffer());
}
else {
kernel.SetArgument(0, static_cast<int>(n));
- kernel.SetArgument(1, alpha);
+ kernel.SetArgument(1, GetRealArg(alpha));
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, static_cast<int>(x_offset));
kernel.SetArgument(4, static_cast<int>(x_inc));
diff --git a/src/utilities/utilities.cpp b/src/utilities/utilities.cpp
index b4a18311..24456252 100644
--- a/src/utilities/utilities.cpp
+++ b/src/utilities/utilities.cpp
@@ -151,6 +151,10 @@ std::string ToString(Precision value) {
case Precision::kComplexDouble: return ToString(static_cast<int>(value))+" (complex-double)";
}
}
+// Specialisation for status codes: produces the decimal text of the code's integer value.
+template <>
+std::string ToString(StatusCode value) {
+  const auto numeric_code = static_cast<int>(value);
+  return std::to_string(numeric_code);
+}
// =================================================================================================