summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorCedric Nugteren <web@cedricnugteren.nl>2016-11-24 19:35:59 +0100
committerGitHub <noreply@github.com>2016-11-24 19:35:59 +0100
commitcb398f0e42fa8e8cae201ffe8b53b60c5ba62d29 (patch)
treec4b63cf8c39e673765f142bbed2fff655500dfee /src
parent88ba1f4db9c0c0f9059e441d6d5988e3849b5849 (diff)
parent2ff3f77392dc7395abf03d3864c42ff894918889 (diff)
Merge pull request #125 from CNugteren/netlib_blas_api
Netlib CBLAS API for CLBlast
Diffstat (limited to 'src')
-rw-r--r--src/clblast_netlib_c.cpp4648
-rw-r--r--src/routines/level1/xscal.cpp4
-rw-r--r--src/utilities/utilities.cpp4
3 files changed, 4654 insertions, 2 deletions
diff --git a/src/clblast_netlib_c.cpp b/src/clblast_netlib_c.cpp
new file mode 100644
index 00000000..3fbabd43
--- /dev/null
+++ b/src/clblast_netlib_c.cpp
@@ -0,0 +1,4648 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the Netlib CBLAS implementations to the CLBlast BLAS routines, performing buffer
+// copies automatically and running on the default OpenCL platform and device. For full control over
+// performance, it is advised to use the regular clblast.h or clblast_c.h headers instead.
+//
+// =================================================================================================
+
+#include <cstdlib>
+
+#include "clblast_netlib_c.h"
+#include "clblast.h"
+#include "utilities/utilities.hpp"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Helper function to get a default OpenCL platform and device
+clblast::Device get_device() {
+ auto platform_id = clblast::ConvertArgument(std::getenv("CLBLAST_PLATFORM"), size_t{0});
+ auto device_id = clblast::ConvertArgument(std::getenv("CLBLAST_DEVICE"), size_t{0});
+ auto platform = clblast::Platform(platform_id);
+ return clblast::Device(platform, device_id);
+}
+
+// =================================================================================================
+// BLAS level-1 (vector-vector) routines
+// =================================================================================================
+
+// ROTG
+void cblas_srotg(float* sa,
+ float* sb,
+ float* sc,
+ float* ss) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto sa_size = 1;
+ const auto sb_size = 1;
+ const auto sc_size = 1;
+ const auto ss_size = 1;
+ auto sa_buffer = clblast::Buffer<float>(context, sa_size);
+ auto sb_buffer = clblast::Buffer<float>(context, sb_size);
+ auto sc_buffer = clblast::Buffer<float>(context, sc_size);
+ auto ss_buffer = clblast::Buffer<float>(context, ss_size);
+ sa_buffer.Write(queue, sa_size, reinterpret_cast<float*>(sa));
+ sb_buffer.Write(queue, sb_size, reinterpret_cast<float*>(sb));
+ sc_buffer.Write(queue, sc_size, reinterpret_cast<float*>(sc));
+ ss_buffer.Write(queue, ss_size, reinterpret_cast<float*>(ss));
+ auto queue_cl = queue();
+ auto s = clblast::Rotg<float>(sa_buffer(), 0,
+ sb_buffer(), 0,
+ sc_buffer(), 0,
+ ss_buffer(), 0,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ sa_buffer.Read(queue, sa_size, reinterpret_cast<float*>(sa));
+ sb_buffer.Read(queue, sb_size, reinterpret_cast<float*>(sb));
+ sc_buffer.Read(queue, sc_size, reinterpret_cast<float*>(sc));
+ ss_buffer.Read(queue, ss_size, reinterpret_cast<float*>(ss));
+}
+void cblas_drotg(double* sa,
+ double* sb,
+ double* sc,
+ double* ss) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto sa_size = 1;
+ const auto sb_size = 1;
+ const auto sc_size = 1;
+ const auto ss_size = 1;
+ auto sa_buffer = clblast::Buffer<double>(context, sa_size);
+ auto sb_buffer = clblast::Buffer<double>(context, sb_size);
+ auto sc_buffer = clblast::Buffer<double>(context, sc_size);
+ auto ss_buffer = clblast::Buffer<double>(context, ss_size);
+ sa_buffer.Write(queue, sa_size, reinterpret_cast<double*>(sa));
+ sb_buffer.Write(queue, sb_size, reinterpret_cast<double*>(sb));
+ sc_buffer.Write(queue, sc_size, reinterpret_cast<double*>(sc));
+ ss_buffer.Write(queue, ss_size, reinterpret_cast<double*>(ss));
+ auto queue_cl = queue();
+ auto s = clblast::Rotg<double>(sa_buffer(), 0,
+ sb_buffer(), 0,
+ sc_buffer(), 0,
+ ss_buffer(), 0,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ sa_buffer.Read(queue, sa_size, reinterpret_cast<double*>(sa));
+ sb_buffer.Read(queue, sb_size, reinterpret_cast<double*>(sb));
+ sc_buffer.Read(queue, sc_size, reinterpret_cast<double*>(sc));
+ ss_buffer.Read(queue, ss_size, reinterpret_cast<double*>(ss));
+}
+
+// ROTMG
+void cblas_srotmg(float* sd1,
+ float* sd2,
+ float* sx1,
+ const float sy1,
+ float* sparam) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto sy1_size = 1;
+ const auto sd1_size = 1;
+ const auto sd2_size = 1;
+ const auto sx1_size = 1;
+ const auto sparam_size = 1;
+ auto sy1_buffer = clblast::Buffer<float>(context, sy1_size);
+ float sy1_vec[1]; sy1_vec[0] = sy1;
+ auto sd1_buffer = clblast::Buffer<float>(context, sd1_size);
+ auto sd2_buffer = clblast::Buffer<float>(context, sd2_size);
+ auto sx1_buffer = clblast::Buffer<float>(context, sx1_size);
+ auto sparam_buffer = clblast::Buffer<float>(context, sparam_size);
+ sy1_buffer.Write(queue, sy1_size, reinterpret_cast<const float*>(sy1_vec));
+ sd1_buffer.Write(queue, sd1_size, reinterpret_cast<float*>(sd1));
+ sd2_buffer.Write(queue, sd2_size, reinterpret_cast<float*>(sd2));
+ sx1_buffer.Write(queue, sx1_size, reinterpret_cast<float*>(sx1));
+ sparam_buffer.Write(queue, sparam_size, reinterpret_cast<float*>(sparam));
+ auto queue_cl = queue();
+ auto s = clblast::Rotmg<float>(sd1_buffer(), 0,
+ sd2_buffer(), 0,
+ sx1_buffer(), 0,
+ sy1_buffer(), 0,
+ sparam_buffer(), 0,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ sd1_buffer.Read(queue, sd1_size, reinterpret_cast<float*>(sd1));
+ sd2_buffer.Read(queue, sd2_size, reinterpret_cast<float*>(sd2));
+ sx1_buffer.Read(queue, sx1_size, reinterpret_cast<float*>(sx1));
+ sparam_buffer.Read(queue, sparam_size, reinterpret_cast<float*>(sparam));
+}
+void cblas_drotmg(double* sd1,
+ double* sd2,
+ double* sx1,
+ const double sy1,
+ double* sparam) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto sy1_size = 1;
+ const auto sd1_size = 1;
+ const auto sd2_size = 1;
+ const auto sx1_size = 1;
+ const auto sparam_size = 1;
+ auto sy1_buffer = clblast::Buffer<double>(context, sy1_size);
+ double sy1_vec[1]; sy1_vec[0] = sy1;
+ auto sd1_buffer = clblast::Buffer<double>(context, sd1_size);
+ auto sd2_buffer = clblast::Buffer<double>(context, sd2_size);
+ auto sx1_buffer = clblast::Buffer<double>(context, sx1_size);
+ auto sparam_buffer = clblast::Buffer<double>(context, sparam_size);
+ sy1_buffer.Write(queue, sy1_size, reinterpret_cast<const double*>(sy1_vec));
+ sd1_buffer.Write(queue, sd1_size, reinterpret_cast<double*>(sd1));
+ sd2_buffer.Write(queue, sd2_size, reinterpret_cast<double*>(sd2));
+ sx1_buffer.Write(queue, sx1_size, reinterpret_cast<double*>(sx1));
+ sparam_buffer.Write(queue, sparam_size, reinterpret_cast<double*>(sparam));
+ auto queue_cl = queue();
+ auto s = clblast::Rotmg<double>(sd1_buffer(), 0,
+ sd2_buffer(), 0,
+ sx1_buffer(), 0,
+ sy1_buffer(), 0,
+ sparam_buffer(), 0,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ sd1_buffer.Read(queue, sd1_size, reinterpret_cast<double*>(sd1));
+ sd2_buffer.Read(queue, sd2_size, reinterpret_cast<double*>(sd2));
+ sx1_buffer.Read(queue, sx1_size, reinterpret_cast<double*>(sx1));
+ sparam_buffer.Read(queue, sparam_size, reinterpret_cast<double*>(sparam));
+}
+
+// ROT
+void cblas_srot(const int n,
+ float* x, const int x_inc,
+ float* y, const int y_inc,
+ const float cos,
+ const float sin) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto x_size = n * x_inc;
+ const auto y_size = n * y_inc;
+ auto x_buffer = clblast::Buffer<float>(context, x_size);
+ auto y_buffer = clblast::Buffer<float>(context, y_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<float*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<float*>(y));
+ auto queue_cl = queue();
+ auto s = clblast::Rot(n,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ cos,
+ sin,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<float*>(x));
+ y_buffer.Read(queue, y_size, reinterpret_cast<float*>(y));
+}
+void cblas_drot(const int n,
+ double* x, const int x_inc,
+ double* y, const int y_inc,
+ const double cos,
+ const double sin) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto x_size = n * x_inc;
+ const auto y_size = n * y_inc;
+ auto x_buffer = clblast::Buffer<double>(context, x_size);
+ auto y_buffer = clblast::Buffer<double>(context, y_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<double*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<double*>(y));
+ auto queue_cl = queue();
+ auto s = clblast::Rot(n,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ cos,
+ sin,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<double*>(x));
+ y_buffer.Read(queue, y_size, reinterpret_cast<double*>(y));
+}
+
+// ROTM
+void cblas_srotm(const int n,
+ float* x, const int x_inc,
+ float* y, const int y_inc,
+ float* sparam) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto x_size = n * x_inc;
+ const auto y_size = n * y_inc;
+ const auto sparam_size = 1;
+ auto x_buffer = clblast::Buffer<float>(context, x_size);
+ auto y_buffer = clblast::Buffer<float>(context, y_size);
+ auto sparam_buffer = clblast::Buffer<float>(context, sparam_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<float*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<float*>(y));
+ sparam_buffer.Write(queue, sparam_size, reinterpret_cast<float*>(sparam));
+ auto queue_cl = queue();
+ auto s = clblast::Rotm<float>(n,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ sparam_buffer(), 0,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<float*>(x));
+ y_buffer.Read(queue, y_size, reinterpret_cast<float*>(y));
+ sparam_buffer.Read(queue, sparam_size, reinterpret_cast<float*>(sparam));
+}
+void cblas_drotm(const int n,
+ double* x, const int x_inc,
+ double* y, const int y_inc,
+ double* sparam) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto x_size = n * x_inc;
+ const auto y_size = n * y_inc;
+ const auto sparam_size = 1;
+ auto x_buffer = clblast::Buffer<double>(context, x_size);
+ auto y_buffer = clblast::Buffer<double>(context, y_size);
+ auto sparam_buffer = clblast::Buffer<double>(context, sparam_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<double*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<double*>(y));
+ sparam_buffer.Write(queue, sparam_size, reinterpret_cast<double*>(sparam));
+ auto queue_cl = queue();
+ auto s = clblast::Rotm<double>(n,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ sparam_buffer(), 0,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<double*>(x));
+ y_buffer.Read(queue, y_size, reinterpret_cast<double*>(y));
+ sparam_buffer.Read(queue, sparam_size, reinterpret_cast<double*>(sparam));
+}
+
+// SWAP
+void cblas_sswap(const int n,
+ float* x, const int x_inc,
+ float* y, const int y_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto x_size = n * x_inc;
+ const auto y_size = n * y_inc;
+ auto x_buffer = clblast::Buffer<float>(context, x_size);
+ auto y_buffer = clblast::Buffer<float>(context, y_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<float*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<float*>(y));
+ auto queue_cl = queue();
+ auto s = clblast::Swap<float>(n,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<float*>(x));
+ y_buffer.Read(queue, y_size, reinterpret_cast<float*>(y));
+}
+void cblas_dswap(const int n,
+ double* x, const int x_inc,
+ double* y, const int y_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto x_size = n * x_inc;
+ const auto y_size = n * y_inc;
+ auto x_buffer = clblast::Buffer<double>(context, x_size);
+ auto y_buffer = clblast::Buffer<double>(context, y_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<double*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<double*>(y));
+ auto queue_cl = queue();
+ auto s = clblast::Swap<double>(n,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<double*>(x));
+ y_buffer.Read(queue, y_size, reinterpret_cast<double*>(y));
+}
+void cblas_cswap(const int n,
+ void* x, const int x_inc,
+ void* y, const int y_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto x_size = n * x_inc;
+ const auto y_size = n * y_inc;
+ auto x_buffer = clblast::Buffer<float2>(context, x_size);
+ auto y_buffer = clblast::Buffer<float2>(context, y_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<float2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<float2*>(y));
+ auto queue_cl = queue();
+ auto s = clblast::Swap<float2>(n,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<float2*>(x));
+ y_buffer.Read(queue, y_size, reinterpret_cast<float2*>(y));
+}
+void cblas_zswap(const int n,
+ void* x, const int x_inc,
+ void* y, const int y_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto x_size = n * x_inc;
+ const auto y_size = n * y_inc;
+ auto x_buffer = clblast::Buffer<double2>(context, x_size);
+ auto y_buffer = clblast::Buffer<double2>(context, y_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<double2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<double2*>(y));
+ auto queue_cl = queue();
+ auto s = clblast::Swap<double2>(n,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<double2*>(x));
+ y_buffer.Read(queue, y_size, reinterpret_cast<double2*>(y));
+}
+
+// SCAL
+void cblas_sscal(const int n,
+ const float alpha,
+ float* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto x_size = n * x_inc;
+ auto x_buffer = clblast::Buffer<float>(context, x_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<float*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Scal(n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<float*>(x));
+}
+void cblas_dscal(const int n,
+ const double alpha,
+ double* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto x_size = n * x_inc;
+ auto x_buffer = clblast::Buffer<double>(context, x_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<double*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Scal(n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<double*>(x));
+}
+void cblas_cscal(const int n,
+ const void* alpha,
+ void* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+ const auto x_size = n * x_inc;
+ auto x_buffer = clblast::Buffer<float2>(context, x_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<float2*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Scal(n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<float2*>(x));
+}
+void cblas_zscal(const int n,
+ const void* alpha,
+ void* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+ const auto x_size = n * x_inc;
+ auto x_buffer = clblast::Buffer<double2>(context, x_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<double2*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Scal(n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<double2*>(x));
+}
+
+// COPY
+void cblas_scopy(const int n,
+ const float* x, const int x_inc,
+ float* y, const int y_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto x_size = n * x_inc;
+ const auto y_size = n * y_inc;
+ auto x_buffer = clblast::Buffer<float>(context, x_size);
+ auto y_buffer = clblast::Buffer<float>(context, y_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<float*>(y));
+ auto queue_cl = queue();
+ auto s = clblast::Copy<float>(n,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ y_buffer.Read(queue, y_size, reinterpret_cast<float*>(y));
+}
+void cblas_dcopy(const int n,
+ const double* x, const int x_inc,
+ double* y, const int y_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto x_size = n * x_inc;
+ const auto y_size = n * y_inc;
+ auto x_buffer = clblast::Buffer<double>(context, x_size);
+ auto y_buffer = clblast::Buffer<double>(context, y_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<double*>(y));
+ auto queue_cl = queue();
+ auto s = clblast::Copy<double>(n,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ y_buffer.Read(queue, y_size, reinterpret_cast<double*>(y));
+}
+void cblas_ccopy(const int n,
+ const void* x, const int x_inc,
+ void* y, const int y_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto x_size = n * x_inc;
+ const auto y_size = n * y_inc;
+ auto x_buffer = clblast::Buffer<float2>(context, x_size);
+ auto y_buffer = clblast::Buffer<float2>(context, y_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<float2*>(y));
+ auto queue_cl = queue();
+ auto s = clblast::Copy<float2>(n,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ y_buffer.Read(queue, y_size, reinterpret_cast<float2*>(y));
+}
+void cblas_zcopy(const int n,
+ const void* x, const int x_inc,
+ void* y, const int y_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto x_size = n * x_inc;
+ const auto y_size = n * y_inc;
+ auto x_buffer = clblast::Buffer<double2>(context, x_size);
+ auto y_buffer = clblast::Buffer<double2>(context, y_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<double2*>(y));
+ auto queue_cl = queue();
+ auto s = clblast::Copy<double2>(n,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ y_buffer.Read(queue, y_size, reinterpret_cast<double2*>(y));
+}
+
+// AXPY
+void cblas_saxpy(const int n,
+ const float alpha,
+ const float* x, const int x_inc,
+ float* y, const int y_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto x_size = n * x_inc;
+ const auto y_size = n * y_inc;
+ auto x_buffer = clblast::Buffer<float>(context, x_size);
+ auto y_buffer = clblast::Buffer<float>(context, y_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<float*>(y));
+ auto queue_cl = queue();
+ auto s = clblast::Axpy(n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ y_buffer.Read(queue, y_size, reinterpret_cast<float*>(y));
+}
+void cblas_daxpy(const int n,
+ const double alpha,
+ const double* x, const int x_inc,
+ double* y, const int y_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto x_size = n * x_inc;
+ const auto y_size = n * y_inc;
+ auto x_buffer = clblast::Buffer<double>(context, x_size);
+ auto y_buffer = clblast::Buffer<double>(context, y_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<double*>(y));
+ auto queue_cl = queue();
+ auto s = clblast::Axpy(n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ y_buffer.Read(queue, y_size, reinterpret_cast<double*>(y));
+}
+void cblas_caxpy(const int n,
+ const void* alpha,
+ const void* x, const int x_inc,
+ void* y, const int y_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+ const auto x_size = n * x_inc;
+ const auto y_size = n * y_inc;
+ auto x_buffer = clblast::Buffer<float2>(context, x_size);
+ auto y_buffer = clblast::Buffer<float2>(context, y_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<float2*>(y));
+ auto queue_cl = queue();
+ auto s = clblast::Axpy(n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ y_buffer.Read(queue, y_size, reinterpret_cast<float2*>(y));
+}
+void cblas_zaxpy(const int n,
+ const void* alpha,
+ const void* x, const int x_inc,
+ void* y, const int y_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+ const auto x_size = n * x_inc;
+ const auto y_size = n * y_inc;
+ auto x_buffer = clblast::Buffer<double2>(context, x_size);
+ auto y_buffer = clblast::Buffer<double2>(context, y_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<double2*>(y));
+ auto queue_cl = queue();
+ auto s = clblast::Axpy(n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ y_buffer.Read(queue, y_size, reinterpret_cast<double2*>(y));
+}
+
+// DOT
+float cblas_sdot(const int n,
+ const float* x, const int x_inc,
+ const float* y, const int y_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto x_size = n * x_inc;
+ const auto y_size = n * y_inc;
+ const auto dot_size = 1;
+ auto x_buffer = clblast::Buffer<float>(context, x_size);
+ auto y_buffer = clblast::Buffer<float>(context, y_size);
+ auto dot_buffer = clblast::Buffer<float>(context, dot_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const float*>(y));
+ auto queue_cl = queue();
+ auto s = clblast::Dot<float>(n,
+ dot_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ float dot[dot_size];
+ dot_buffer.Read(queue, dot_size, reinterpret_cast<float*>(dot));
+ return dot[0];
+}
+double cblas_ddot(const int n,
+ const double* x, const int x_inc,
+ const double* y, const int y_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto x_size = n * x_inc;
+ const auto y_size = n * y_inc;
+ const auto dot_size = 1;
+ auto x_buffer = clblast::Buffer<double>(context, x_size);
+ auto y_buffer = clblast::Buffer<double>(context, y_size);
+ auto dot_buffer = clblast::Buffer<double>(context, dot_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const double*>(y));
+ auto queue_cl = queue();
+ auto s = clblast::Dot<double>(n,
+ dot_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ double dot[dot_size];
+ dot_buffer.Read(queue, dot_size, reinterpret_cast<double*>(dot));
+ return dot[0];
+}
+
+// DOTU
+void cblas_cdotu_sub(const int n,
+ const void* x, const int x_inc,
+ const void* y, const int y_inc,
+ void* dot) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto x_size = n * x_inc;
+ const auto y_size = n * y_inc;
+ const auto dot_size = 1;
+ auto x_buffer = clblast::Buffer<float2>(context, x_size);
+ auto y_buffer = clblast::Buffer<float2>(context, y_size);
+ auto dot_buffer = clblast::Buffer<float2>(context, dot_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const float2*>(y));
+ auto queue_cl = queue();
+ auto s = clblast::Dotu<float2>(n,
+ dot_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ dot_buffer.Read(queue, dot_size, reinterpret_cast<float2*>(dot));
+}
+void cblas_zdotu_sub(const int n,
+ const void* x, const int x_inc,
+ const void* y, const int y_inc,
+ void* dot) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto x_size = n * x_inc;
+ const auto y_size = n * y_inc;
+ const auto dot_size = 1;
+ auto x_buffer = clblast::Buffer<double2>(context, x_size);
+ auto y_buffer = clblast::Buffer<double2>(context, y_size);
+ auto dot_buffer = clblast::Buffer<double2>(context, dot_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const double2*>(y));
+ auto queue_cl = queue();
+ auto s = clblast::Dotu<double2>(n,
+ dot_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ dot_buffer.Read(queue, dot_size, reinterpret_cast<double2*>(dot));
+}
+
+// DOTC
+void cblas_cdotc_sub(const int n,
+ const void* x, const int x_inc,
+ const void* y, const int y_inc,
+ void* dot) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto x_size = n * x_inc;
+ const auto y_size = n * y_inc;
+ const auto dot_size = 1;
+ auto x_buffer = clblast::Buffer<float2>(context, x_size);
+ auto y_buffer = clblast::Buffer<float2>(context, y_size);
+ auto dot_buffer = clblast::Buffer<float2>(context, dot_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const float2*>(y));
+ auto queue_cl = queue();
+ auto s = clblast::Dotc<float2>(n,
+ dot_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ dot_buffer.Read(queue, dot_size, reinterpret_cast<float2*>(dot));
+}
+void cblas_zdotc_sub(const int n,
+ const void* x, const int x_inc,
+ const void* y, const int y_inc,
+ void* dot) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto x_size = n * x_inc;
+ const auto y_size = n * y_inc;
+ const auto dot_size = 1;
+ auto x_buffer = clblast::Buffer<double2>(context, x_size);
+ auto y_buffer = clblast::Buffer<double2>(context, y_size);
+ auto dot_buffer = clblast::Buffer<double2>(context, dot_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const double2*>(y));
+ auto queue_cl = queue();
+ auto s = clblast::Dotc<double2>(n,
+ dot_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ dot_buffer.Read(queue, dot_size, reinterpret_cast<double2*>(dot));
+}
+
+// NRM2
+float cblas_snrm2(const int n,
+ const float* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto x_size = n * x_inc;
+ const auto nrm2_size = 1;
+ auto x_buffer = clblast::Buffer<float>(context, x_size);
+ auto nrm2_buffer = clblast::Buffer<float>(context, nrm2_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Nrm2<float>(n,
+ nrm2_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ float nrm2[nrm2_size];
+ nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast<float*>(nrm2));
+ return nrm2[0];
+}
+double cblas_dnrm2(const int n,
+ const double* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto x_size = n * x_inc;
+ const auto nrm2_size = 1;
+ auto x_buffer = clblast::Buffer<double>(context, x_size);
+ auto nrm2_buffer = clblast::Buffer<double>(context, nrm2_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Nrm2<double>(n,
+ nrm2_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ double nrm2[nrm2_size];
+ nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast<double*>(nrm2));
+ return nrm2[0];
+}
+float cblas_scnrm2(const int n,
+ const void* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto x_size = n * x_inc;
+ const auto nrm2_size = 1;
+ auto x_buffer = clblast::Buffer<float2>(context, x_size);
+ auto nrm2_buffer = clblast::Buffer<float2>(context, nrm2_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Nrm2<float2>(n,
+ nrm2_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ float2 nrm2[nrm2_size];
+ nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast<float2*>(nrm2));
+ return nrm2[0].real();
+}
+double cblas_dznrm2(const int n,
+ const void* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto x_size = n * x_inc;
+ const auto nrm2_size = 1;
+ auto x_buffer = clblast::Buffer<double2>(context, x_size);
+ auto nrm2_buffer = clblast::Buffer<double2>(context, nrm2_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Nrm2<double2>(n,
+ nrm2_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ double2 nrm2[nrm2_size];
+ nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast<double2*>(nrm2));
+ return nrm2[0].real();
+}
+
+// ASUM
+float cblas_sasum(const int n,
+ const float* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto x_size = n * x_inc;
+ const auto asum_size = 1;
+ auto x_buffer = clblast::Buffer<float>(context, x_size);
+ auto asum_buffer = clblast::Buffer<float>(context, asum_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Asum<float>(n,
+ asum_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ float asum[asum_size];
+ asum_buffer.Read(queue, asum_size, reinterpret_cast<float*>(asum));
+ return asum[0];
+}
+double cblas_dasum(const int n,
+ const double* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto x_size = n * x_inc;
+ const auto asum_size = 1;
+ auto x_buffer = clblast::Buffer<double>(context, x_size);
+ auto asum_buffer = clblast::Buffer<double>(context, asum_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Asum<double>(n,
+ asum_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ double asum[asum_size];
+ asum_buffer.Read(queue, asum_size, reinterpret_cast<double*>(asum));
+ return asum[0];
+}
+float cblas_scasum(const int n,
+ const void* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto x_size = n * x_inc;
+ const auto asum_size = 1;
+ auto x_buffer = clblast::Buffer<float2>(context, x_size);
+ auto asum_buffer = clblast::Buffer<float2>(context, asum_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Asum<float2>(n,
+ asum_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ float2 asum[asum_size];
+ asum_buffer.Read(queue, asum_size, reinterpret_cast<float2*>(asum));
+ return asum[0].real();
+}
+double cblas_dzasum(const int n,
+ const void* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto x_size = n * x_inc;
+ const auto asum_size = 1;
+ auto x_buffer = clblast::Buffer<double2>(context, x_size);
+ auto asum_buffer = clblast::Buffer<double2>(context, asum_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Asum<double2>(n,
+ asum_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ double2 asum[asum_size];
+ asum_buffer.Read(queue, asum_size, reinterpret_cast<double2*>(asum));
+ return asum[0].real();
+}
+
+// SUM
+float cblas_ssum(const int n,
+ const float* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto x_size = n * x_inc;
+ const auto sum_size = 1;
+ auto x_buffer = clblast::Buffer<float>(context, x_size);
+ auto sum_buffer = clblast::Buffer<float>(context, sum_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Sum<float>(n,
+ sum_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ float sum[sum_size];
+ sum_buffer.Read(queue, sum_size, reinterpret_cast<float*>(sum));
+ return sum[0];
+}
+double cblas_dsum(const int n,
+ const double* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto x_size = n * x_inc;
+ const auto sum_size = 1;
+ auto x_buffer = clblast::Buffer<double>(context, x_size);
+ auto sum_buffer = clblast::Buffer<double>(context, sum_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Sum<double>(n,
+ sum_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ double sum[sum_size];
+ sum_buffer.Read(queue, sum_size, reinterpret_cast<double*>(sum));
+ return sum[0];
+}
+float cblas_scsum(const int n,
+ const void* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto x_size = n * x_inc;
+ const auto sum_size = 1;
+ auto x_buffer = clblast::Buffer<float2>(context, x_size);
+ auto sum_buffer = clblast::Buffer<float2>(context, sum_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Sum<float2>(n,
+ sum_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ float2 sum[sum_size];
+ sum_buffer.Read(queue, sum_size, reinterpret_cast<float2*>(sum));
+ return sum[0].real();
+}
+double cblas_dzsum(const int n,
+ const void* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto x_size = n * x_inc;
+ const auto sum_size = 1;
+ auto x_buffer = clblast::Buffer<double2>(context, x_size);
+ auto sum_buffer = clblast::Buffer<double2>(context, sum_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Sum<double2>(n,
+ sum_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ double2 sum[sum_size];
+ sum_buffer.Read(queue, sum_size, reinterpret_cast<double2*>(sum));
+ return sum[0].real();
+}
+
+// AMAX
+int cblas_isamax(const int n,
+ const float* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto x_size = n * x_inc;
+ const auto imax_size = 1;
+ auto x_buffer = clblast::Buffer<float>(context, x_size);
+ auto imax_buffer = clblast::Buffer<int>(context, imax_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Amax<float>(n,
+ imax_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ int imax[imax_size];
+ imax_buffer.Read(queue, imax_size, reinterpret_cast<int*>(imax));
+ return imax[0];
+}
+int cblas_idamax(const int n,
+ const double* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto x_size = n * x_inc;
+ const auto imax_size = 1;
+ auto x_buffer = clblast::Buffer<double>(context, x_size);
+ auto imax_buffer = clblast::Buffer<int>(context, imax_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Amax<double>(n,
+ imax_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ int imax[imax_size];
+ imax_buffer.Read(queue, imax_size, reinterpret_cast<int*>(imax));
+ return imax[0];
+}
+int cblas_icamax(const int n,
+ const void* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto x_size = n * x_inc;
+ const auto imax_size = 1;
+ auto x_buffer = clblast::Buffer<float2>(context, x_size);
+ auto imax_buffer = clblast::Buffer<int>(context, imax_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Amax<float2>(n,
+ imax_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ int imax[imax_size];
+ imax_buffer.Read(queue, imax_size, reinterpret_cast<int*>(imax));
+ return imax[0];
+}
+int cblas_izamax(const int n,
+ const void* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto x_size = n * x_inc;
+ const auto imax_size = 1;
+ auto x_buffer = clblast::Buffer<double2>(context, x_size);
+ auto imax_buffer = clblast::Buffer<int>(context, imax_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Amax<double2>(n,
+ imax_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ int imax[imax_size];
+ imax_buffer.Read(queue, imax_size, reinterpret_cast<int*>(imax));
+ return imax[0];
+}
+
+// MAX
+int cblas_ismax(const int n,
+ const float* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto x_size = n * x_inc;
+ const auto imax_size = 1;
+ auto x_buffer = clblast::Buffer<float>(context, x_size);
+ auto imax_buffer = clblast::Buffer<int>(context, imax_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Max<float>(n,
+ imax_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ int imax[imax_size];
+ imax_buffer.Read(queue, imax_size, reinterpret_cast<int*>(imax));
+ return imax[0];
+}
+int cblas_idmax(const int n,
+ const double* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto x_size = n * x_inc;
+ const auto imax_size = 1;
+ auto x_buffer = clblast::Buffer<double>(context, x_size);
+ auto imax_buffer = clblast::Buffer<int>(context, imax_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Max<double>(n,
+ imax_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ int imax[imax_size];
+ imax_buffer.Read(queue, imax_size, reinterpret_cast<int*>(imax));
+ return imax[0];
+}
+int cblas_icmax(const int n,
+ const void* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto x_size = n * x_inc;
+ const auto imax_size = 1;
+ auto x_buffer = clblast::Buffer<float2>(context, x_size);
+ auto imax_buffer = clblast::Buffer<int>(context, imax_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Max<float2>(n,
+ imax_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ int imax[imax_size];
+ imax_buffer.Read(queue, imax_size, reinterpret_cast<int*>(imax));
+ return imax[0];
+}
+int cblas_izmax(const int n,
+ const void* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto x_size = n * x_inc;
+ const auto imax_size = 1;
+ auto x_buffer = clblast::Buffer<double2>(context, x_size);
+ auto imax_buffer = clblast::Buffer<int>(context, imax_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Max<double2>(n,
+ imax_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ int imax[imax_size];
+ imax_buffer.Read(queue, imax_size, reinterpret_cast<int*>(imax));
+ return imax[0];
+}
+
+// MIN
+int cblas_ismin(const int n,
+ const float* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto x_size = n * x_inc;
+ const auto imin_size = 1;
+ auto x_buffer = clblast::Buffer<float>(context, x_size);
+ auto imin_buffer = clblast::Buffer<int>(context, imin_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Min<float>(n,
+ imin_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ int imin[imin_size];
+ imin_buffer.Read(queue, imin_size, reinterpret_cast<int*>(imin));
+ return imin[0];
+}
+int cblas_idmin(const int n,
+ const double* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto x_size = n * x_inc;
+ const auto imin_size = 1;
+ auto x_buffer = clblast::Buffer<double>(context, x_size);
+ auto imin_buffer = clblast::Buffer<int>(context, imin_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Min<double>(n,
+ imin_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ int imin[imin_size];
+ imin_buffer.Read(queue, imin_size, reinterpret_cast<int*>(imin));
+ return imin[0];
+}
+int cblas_icmin(const int n,
+ const void* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto x_size = n * x_inc;
+ const auto imin_size = 1;
+ auto x_buffer = clblast::Buffer<float2>(context, x_size);
+ auto imin_buffer = clblast::Buffer<int>(context, imin_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Min<float2>(n,
+ imin_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ int imin[imin_size];
+ imin_buffer.Read(queue, imin_size, reinterpret_cast<int*>(imin));
+ return imin[0];
+}
+int cblas_izmin(const int n,
+ const void* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto x_size = n * x_inc;
+ const auto imin_size = 1;
+ auto x_buffer = clblast::Buffer<double2>(context, x_size);
+ auto imin_buffer = clblast::Buffer<int>(context, imin_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Min<double2>(n,
+ imin_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ int imin[imin_size];
+ imin_buffer.Read(queue, imin_size, reinterpret_cast<int*>(imin));
+ return imin[0];
+}
+
+// =================================================================================================
+// BLAS level-2 (matrix-vector) routines
+// =================================================================================================
+
+// GEMV
+void cblas_sgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const int m, const int n,
+ const float alpha,
+ const float* a, const int a_ld,
+ const float* x, const int x_inc,
+ const float beta,
+ float* y, const int y_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto beta_cpp = beta;
+ const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
+ const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc;
+ const auto y_size = (a_transpose != CLBlastTransposeNo) ? n * y_inc : m * y_inc;
+ auto a_buffer = clblast::Buffer<float>(context, a_size);
+ auto x_buffer = clblast::Buffer<float>(context, x_size);
+ auto y_buffer = clblast::Buffer<float>(context, y_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<float*>(y));
+ auto queue_cl = queue();
+ auto s = clblast::Gemv(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Transpose>(a_transpose),
+ m, n,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ x_buffer(), 0, x_inc,
+ beta_cpp,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ y_buffer.Read(queue, y_size, reinterpret_cast<float*>(y));
+}
+void cblas_dgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const int m, const int n,
+ const double alpha,
+ const double* a, const int a_ld,
+ const double* x, const int x_inc,
+ const double beta,
+ double* y, const int y_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto beta_cpp = beta;
+ const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
+ const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc;
+ const auto y_size = (a_transpose != CLBlastTransposeNo) ? n * y_inc : m * y_inc;
+ auto a_buffer = clblast::Buffer<double>(context, a_size);
+ auto x_buffer = clblast::Buffer<double>(context, x_size);
+ auto y_buffer = clblast::Buffer<double>(context, y_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<double*>(y));
+ auto queue_cl = queue();
+ auto s = clblast::Gemv(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Transpose>(a_transpose),
+ m, n,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ x_buffer(), 0, x_inc,
+ beta_cpp,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ y_buffer.Read(queue, y_size, reinterpret_cast<double*>(y));
+}
+void cblas_cgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const int m, const int n,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* x, const int x_inc,
+ const void* beta,
+ void* y, const int y_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+ const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
+ const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
+ const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc;
+ const auto y_size = (a_transpose != CLBlastTransposeNo) ? n * y_inc : m * y_inc;
+ auto a_buffer = clblast::Buffer<float2>(context, a_size);
+ auto x_buffer = clblast::Buffer<float2>(context, x_size);
+ auto y_buffer = clblast::Buffer<float2>(context, y_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<float2*>(y));
+ auto queue_cl = queue();
+ auto s = clblast::Gemv(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Transpose>(a_transpose),
+ m, n,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ x_buffer(), 0, x_inc,
+ beta_cpp,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ y_buffer.Read(queue, y_size, reinterpret_cast<float2*>(y));
+}
+void cblas_zgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const int m, const int n,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* x, const int x_inc,
+ const void* beta,
+ void* y, const int y_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+ const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
+ const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
+ const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc;
+ const auto y_size = (a_transpose != CLBlastTransposeNo) ? n * y_inc : m * y_inc;
+ auto a_buffer = clblast::Buffer<double2>(context, a_size);
+ auto x_buffer = clblast::Buffer<double2>(context, x_size);
+ auto y_buffer = clblast::Buffer<double2>(context, y_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<double2*>(y));
+ auto queue_cl = queue();
+ auto s = clblast::Gemv(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Transpose>(a_transpose),
+ m, n,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ x_buffer(), 0, x_inc,
+ beta_cpp,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ y_buffer.Read(queue, y_size, reinterpret_cast<double2*>(y));
+}
+
+// GBMV
+void cblas_sgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const int m, const int n, const int kl, const int ku,
+ const float alpha,
+ const float* a, const int a_ld,
+ const float* x, const int x_inc,
+ const float beta,
+ float* y, const int y_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto beta_cpp = beta;
+ const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
+ const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc;
+ const auto y_size = (a_transpose != CLBlastTransposeNo) ? n * y_inc : m * y_inc;
+ auto a_buffer = clblast::Buffer<float>(context, a_size);
+ auto x_buffer = clblast::Buffer<float>(context, x_size);
+ auto y_buffer = clblast::Buffer<float>(context, y_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<float*>(y));
+ auto queue_cl = queue();
+ auto s = clblast::Gbmv(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Transpose>(a_transpose),
+ m, n, kl, ku,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ x_buffer(), 0, x_inc,
+ beta_cpp,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ y_buffer.Read(queue, y_size, reinterpret_cast<float*>(y));
+}
+void cblas_dgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const int m, const int n, const int kl, const int ku,
+ const double alpha,
+ const double* a, const int a_ld,
+ const double* x, const int x_inc,
+ const double beta,
+ double* y, const int y_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto beta_cpp = beta;
+ const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
+ const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc;
+ const auto y_size = (a_transpose != CLBlastTransposeNo) ? n * y_inc : m * y_inc;
+ auto a_buffer = clblast::Buffer<double>(context, a_size);
+ auto x_buffer = clblast::Buffer<double>(context, x_size);
+ auto y_buffer = clblast::Buffer<double>(context, y_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<double*>(y));
+ auto queue_cl = queue();
+ auto s = clblast::Gbmv(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Transpose>(a_transpose),
+ m, n, kl, ku,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ x_buffer(), 0, x_inc,
+ beta_cpp,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ y_buffer.Read(queue, y_size, reinterpret_cast<double*>(y));
+}
+void cblas_cgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const int m, const int n, const int kl, const int ku,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* x, const int x_inc,
+ const void* beta,
+ void* y, const int y_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+ const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
+ const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
+ const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc;
+ const auto y_size = (a_transpose != CLBlastTransposeNo) ? n * y_inc : m * y_inc;
+ auto a_buffer = clblast::Buffer<float2>(context, a_size);
+ auto x_buffer = clblast::Buffer<float2>(context, x_size);
+ auto y_buffer = clblast::Buffer<float2>(context, y_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<float2*>(y));
+ auto queue_cl = queue();
+ auto s = clblast::Gbmv(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Transpose>(a_transpose),
+ m, n, kl, ku,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ x_buffer(), 0, x_inc,
+ beta_cpp,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ y_buffer.Read(queue, y_size, reinterpret_cast<float2*>(y));
+}
+void cblas_zgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const int m, const int n, const int kl, const int ku,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* x, const int x_inc,
+ const void* beta,
+ void* y, const int y_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+ const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
+ const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
+ const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc;
+ const auto y_size = (a_transpose != CLBlastTransposeNo) ? n * y_inc : m * y_inc;
+ auto a_buffer = clblast::Buffer<double2>(context, a_size);
+ auto x_buffer = clblast::Buffer<double2>(context, x_size);
+ auto y_buffer = clblast::Buffer<double2>(context, y_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<double2*>(y));
+ auto queue_cl = queue();
+ auto s = clblast::Gbmv(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Transpose>(a_transpose),
+ m, n, kl, ku,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ x_buffer(), 0, x_inc,
+ beta_cpp,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ y_buffer.Read(queue, y_size, reinterpret_cast<double2*>(y));
+}
+
+// HEMV
+void cblas_chemv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* x, const int x_inc,
+ const void* beta,
+ void* y, const int y_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+ const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
+ const auto a_size = n * a_ld;
+ const auto x_size = n * x_inc;
+ const auto y_size = n * y_inc;
+ auto a_buffer = clblast::Buffer<float2>(context, a_size);
+ auto x_buffer = clblast::Buffer<float2>(context, x_size);
+ auto y_buffer = clblast::Buffer<float2>(context, y_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<float2*>(y));
+ auto queue_cl = queue();
+ auto s = clblast::Hemv(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ x_buffer(), 0, x_inc,
+ beta_cpp,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ y_buffer.Read(queue, y_size, reinterpret_cast<float2*>(y));
+}
+void cblas_zhemv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* x, const int x_inc,
+ const void* beta,
+ void* y, const int y_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+ const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
+ const auto a_size = n * a_ld;
+ const auto x_size = n * x_inc;
+ const auto y_size = n * y_inc;
+ auto a_buffer = clblast::Buffer<double2>(context, a_size);
+ auto x_buffer = clblast::Buffer<double2>(context, x_size);
+ auto y_buffer = clblast::Buffer<double2>(context, y_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<double2*>(y));
+ auto queue_cl = queue();
+ auto s = clblast::Hemv(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ x_buffer(), 0, x_inc,
+ beta_cpp,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ y_buffer.Read(queue, y_size, reinterpret_cast<double2*>(y));
+}
+
+// HBMV
+void cblas_chbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n, const int k,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* x, const int x_inc,
+ const void* beta,
+ void* y, const int y_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+ const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
+ const auto a_size = n * a_ld;
+ const auto x_size = n * x_inc;
+ const auto y_size = n * y_inc;
+ auto a_buffer = clblast::Buffer<float2>(context, a_size);
+ auto x_buffer = clblast::Buffer<float2>(context, x_size);
+ auto y_buffer = clblast::Buffer<float2>(context, y_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<float2*>(y));
+ auto queue_cl = queue();
+ auto s = clblast::Hbmv(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n, k,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ x_buffer(), 0, x_inc,
+ beta_cpp,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ y_buffer.Read(queue, y_size, reinterpret_cast<float2*>(y));
+}
+void cblas_zhbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n, const int k,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* x, const int x_inc,
+ const void* beta,
+ void* y, const int y_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+ const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
+ const auto a_size = n * a_ld;
+ const auto x_size = n * x_inc;
+ const auto y_size = n * y_inc;
+ auto a_buffer = clblast::Buffer<double2>(context, a_size);
+ auto x_buffer = clblast::Buffer<double2>(context, x_size);
+ auto y_buffer = clblast::Buffer<double2>(context, y_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<double2*>(y));
+ auto queue_cl = queue();
+ auto s = clblast::Hbmv(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n, k,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ x_buffer(), 0, x_inc,
+ beta_cpp,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ y_buffer.Read(queue, y_size, reinterpret_cast<double2*>(y));
+}
+
+// HPMV
+void cblas_chpmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const void* alpha,
+ const void* ap,
+ const void* x, const int x_inc,
+ const void* beta,
+ void* y, const int y_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+ const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
+ const auto ap_size = ((n*(n+1)) / 2);
+ const auto x_size = n * x_inc;
+ const auto y_size = n * y_inc;
+ auto ap_buffer = clblast::Buffer<float2>(context, ap_size);
+ auto x_buffer = clblast::Buffer<float2>(context, x_size);
+ auto y_buffer = clblast::Buffer<float2>(context, y_size);
+ ap_buffer.Write(queue, ap_size, reinterpret_cast<const float2*>(ap));
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<float2*>(y));
+ auto queue_cl = queue();
+ auto s = clblast::Hpmv(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ ap_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ beta_cpp,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ y_buffer.Read(queue, y_size, reinterpret_cast<float2*>(y));
+}
+void cblas_zhpmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const void* alpha,
+ const void* ap,
+ const void* x, const int x_inc,
+ const void* beta,
+ void* y, const int y_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+ const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
+ const auto ap_size = ((n*(n+1)) / 2);
+ const auto x_size = n * x_inc;
+ const auto y_size = n * y_inc;
+ auto ap_buffer = clblast::Buffer<double2>(context, ap_size);
+ auto x_buffer = clblast::Buffer<double2>(context, x_size);
+ auto y_buffer = clblast::Buffer<double2>(context, y_size);
+ ap_buffer.Write(queue, ap_size, reinterpret_cast<const double2*>(ap));
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<double2*>(y));
+ auto queue_cl = queue();
+ auto s = clblast::Hpmv(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ ap_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ beta_cpp,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ y_buffer.Read(queue, y_size, reinterpret_cast<double2*>(y));
+}
+
+// SYMV
+void cblas_ssymv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const float alpha,
+ const float* a, const int a_ld,
+ const float* x, const int x_inc,
+ const float beta,
+ float* y, const int y_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto beta_cpp = beta;
+ const auto a_size = n * a_ld;
+ const auto x_size = n * x_inc;
+ const auto y_size = n * y_inc;
+ auto a_buffer = clblast::Buffer<float>(context, a_size);
+ auto x_buffer = clblast::Buffer<float>(context, x_size);
+ auto y_buffer = clblast::Buffer<float>(context, y_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<float*>(y));
+ auto queue_cl = queue();
+ auto s = clblast::Symv(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ x_buffer(), 0, x_inc,
+ beta_cpp,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ y_buffer.Read(queue, y_size, reinterpret_cast<float*>(y));
+}
+void cblas_dsymv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const double alpha,
+ const double* a, const int a_ld,
+ const double* x, const int x_inc,
+ const double beta,
+ double* y, const int y_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto beta_cpp = beta;
+ const auto a_size = n * a_ld;
+ const auto x_size = n * x_inc;
+ const auto y_size = n * y_inc;
+ auto a_buffer = clblast::Buffer<double>(context, a_size);
+ auto x_buffer = clblast::Buffer<double>(context, x_size);
+ auto y_buffer = clblast::Buffer<double>(context, y_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<double*>(y));
+ auto queue_cl = queue();
+ auto s = clblast::Symv(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ x_buffer(), 0, x_inc,
+ beta_cpp,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ y_buffer.Read(queue, y_size, reinterpret_cast<double*>(y));
+}
+
+// SBMV
+void cblas_ssbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n, const int k,
+ const float alpha,
+ const float* a, const int a_ld,
+ const float* x, const int x_inc,
+ const float beta,
+ float* y, const int y_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto beta_cpp = beta;
+ const auto a_size = n * a_ld;
+ const auto x_size = n * x_inc;
+ const auto y_size = n * y_inc;
+ auto a_buffer = clblast::Buffer<float>(context, a_size);
+ auto x_buffer = clblast::Buffer<float>(context, x_size);
+ auto y_buffer = clblast::Buffer<float>(context, y_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<float*>(y));
+ auto queue_cl = queue();
+ auto s = clblast::Sbmv(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n, k,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ x_buffer(), 0, x_inc,
+ beta_cpp,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ y_buffer.Read(queue, y_size, reinterpret_cast<float*>(y));
+}
+void cblas_dsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n, const int k,
+ const double alpha,
+ const double* a, const int a_ld,
+ const double* x, const int x_inc,
+ const double beta,
+ double* y, const int y_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto beta_cpp = beta;
+ const auto a_size = n * a_ld;
+ const auto x_size = n * x_inc;
+ const auto y_size = n * y_inc;
+ auto a_buffer = clblast::Buffer<double>(context, a_size);
+ auto x_buffer = clblast::Buffer<double>(context, x_size);
+ auto y_buffer = clblast::Buffer<double>(context, y_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<double*>(y));
+ auto queue_cl = queue();
+ auto s = clblast::Sbmv(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n, k,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ x_buffer(), 0, x_inc,
+ beta_cpp,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ y_buffer.Read(queue, y_size, reinterpret_cast<double*>(y));
+}
+
+// SPMV
+void cblas_sspmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const float alpha,
+ const float* ap,
+ const float* x, const int x_inc,
+ const float beta,
+ float* y, const int y_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto beta_cpp = beta;
+ const auto ap_size = ((n*(n+1)) / 2);
+ const auto x_size = n * x_inc;
+ const auto y_size = n * y_inc;
+ auto ap_buffer = clblast::Buffer<float>(context, ap_size);
+ auto x_buffer = clblast::Buffer<float>(context, x_size);
+ auto y_buffer = clblast::Buffer<float>(context, y_size);
+ ap_buffer.Write(queue, ap_size, reinterpret_cast<const float*>(ap));
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<float*>(y));
+ auto queue_cl = queue();
+ auto s = clblast::Spmv(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ ap_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ beta_cpp,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ y_buffer.Read(queue, y_size, reinterpret_cast<float*>(y));
+}
+void cblas_dspmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const double alpha,
+ const double* ap,
+ const double* x, const int x_inc,
+ const double beta,
+ double* y, const int y_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto beta_cpp = beta;
+ const auto ap_size = ((n*(n+1)) / 2);
+ const auto x_size = n * x_inc;
+ const auto y_size = n * y_inc;
+ auto ap_buffer = clblast::Buffer<double>(context, ap_size);
+ auto x_buffer = clblast::Buffer<double>(context, x_size);
+ auto y_buffer = clblast::Buffer<double>(context, y_size);
+ ap_buffer.Write(queue, ap_size, reinterpret_cast<const double*>(ap));
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<double*>(y));
+ auto queue_cl = queue();
+ auto s = clblast::Spmv(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ ap_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ beta_cpp,
+ y_buffer(), 0, y_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ y_buffer.Read(queue, y_size, reinterpret_cast<double*>(y));
+}
+
+// TRMV
+void cblas_strmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n,
+ const float* a, const int a_ld,
+ float* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto a_size = n * a_ld;
+ const auto x_size = n * x_inc;
+ auto a_buffer = clblast::Buffer<float>(context, a_size);
+ auto x_buffer = clblast::Buffer<float>(context, x_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
+ x_buffer.Write(queue, x_size, reinterpret_cast<float*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Trmv<float>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n,
+ a_buffer(), 0, a_ld,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<float*>(x));
+}
+void cblas_dtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n,
+ const double* a, const int a_ld,
+ double* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto a_size = n * a_ld;
+ const auto x_size = n * x_inc;
+ auto a_buffer = clblast::Buffer<double>(context, a_size);
+ auto x_buffer = clblast::Buffer<double>(context, x_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
+ x_buffer.Write(queue, x_size, reinterpret_cast<double*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Trmv<double>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n,
+ a_buffer(), 0, a_ld,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<double*>(x));
+}
+void cblas_ctrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n,
+ const void* a, const int a_ld,
+ void* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto a_size = n * a_ld;
+ const auto x_size = n * x_inc;
+ auto a_buffer = clblast::Buffer<float2>(context, a_size);
+ auto x_buffer = clblast::Buffer<float2>(context, x_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+ x_buffer.Write(queue, x_size, reinterpret_cast<float2*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Trmv<float2>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n,
+ a_buffer(), 0, a_ld,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<float2*>(x));
+}
+void cblas_ztrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n,
+ const void* a, const int a_ld,
+ void* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto a_size = n * a_ld;
+ const auto x_size = n * x_inc;
+ auto a_buffer = clblast::Buffer<double2>(context, a_size);
+ auto x_buffer = clblast::Buffer<double2>(context, x_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+ x_buffer.Write(queue, x_size, reinterpret_cast<double2*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Trmv<double2>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n,
+ a_buffer(), 0, a_ld,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<double2*>(x));
+}
+
+// TBMV
+void cblas_stbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n, const int k,
+ const float* a, const int a_ld,
+ float* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto a_size = n * a_ld;
+ const auto x_size = n * x_inc;
+ auto a_buffer = clblast::Buffer<float>(context, a_size);
+ auto x_buffer = clblast::Buffer<float>(context, x_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
+ x_buffer.Write(queue, x_size, reinterpret_cast<float*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Tbmv<float>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n, k,
+ a_buffer(), 0, a_ld,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<float*>(x));
+}
+void cblas_dtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n, const int k,
+ const double* a, const int a_ld,
+ double* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto a_size = n * a_ld;
+ const auto x_size = n * x_inc;
+ auto a_buffer = clblast::Buffer<double>(context, a_size);
+ auto x_buffer = clblast::Buffer<double>(context, x_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
+ x_buffer.Write(queue, x_size, reinterpret_cast<double*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Tbmv<double>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n, k,
+ a_buffer(), 0, a_ld,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<double*>(x));
+}
+void cblas_ctbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n, const int k,
+ const void* a, const int a_ld,
+ void* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto a_size = n * a_ld;
+ const auto x_size = n * x_inc;
+ auto a_buffer = clblast::Buffer<float2>(context, a_size);
+ auto x_buffer = clblast::Buffer<float2>(context, x_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+ x_buffer.Write(queue, x_size, reinterpret_cast<float2*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Tbmv<float2>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n, k,
+ a_buffer(), 0, a_ld,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<float2*>(x));
+}
+void cblas_ztbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n, const int k,
+ const void* a, const int a_ld,
+ void* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto a_size = n * a_ld;
+ const auto x_size = n * x_inc;
+ auto a_buffer = clblast::Buffer<double2>(context, a_size);
+ auto x_buffer = clblast::Buffer<double2>(context, x_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+ x_buffer.Write(queue, x_size, reinterpret_cast<double2*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Tbmv<double2>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n, k,
+ a_buffer(), 0, a_ld,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<double2*>(x));
+}
+
+// TPMV
+void cblas_stpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n,
+ const float* ap,
+ float* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto ap_size = ((n*(n+1)) / 2);
+ const auto x_size = n * x_inc;
+ auto ap_buffer = clblast::Buffer<float>(context, ap_size);
+ auto x_buffer = clblast::Buffer<float>(context, x_size);
+ ap_buffer.Write(queue, ap_size, reinterpret_cast<const float*>(ap));
+ x_buffer.Write(queue, x_size, reinterpret_cast<float*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Tpmv<float>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n,
+ ap_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<float*>(x));
+}
+void cblas_dtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n,
+ const double* ap,
+ double* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto ap_size = ((n*(n+1)) / 2);
+ const auto x_size = n * x_inc;
+ auto ap_buffer = clblast::Buffer<double>(context, ap_size);
+ auto x_buffer = clblast::Buffer<double>(context, x_size);
+ ap_buffer.Write(queue, ap_size, reinterpret_cast<const double*>(ap));
+ x_buffer.Write(queue, x_size, reinterpret_cast<double*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Tpmv<double>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n,
+ ap_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<double*>(x));
+}
+void cblas_ctpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n,
+ const void* ap,
+ void* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto ap_size = ((n*(n+1)) / 2);
+ const auto x_size = n * x_inc;
+ auto ap_buffer = clblast::Buffer<float2>(context, ap_size);
+ auto x_buffer = clblast::Buffer<float2>(context, x_size);
+ ap_buffer.Write(queue, ap_size, reinterpret_cast<const float2*>(ap));
+ x_buffer.Write(queue, x_size, reinterpret_cast<float2*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Tpmv<float2>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n,
+ ap_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<float2*>(x));
+}
+void cblas_ztpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n,
+ const void* ap,
+ void* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto ap_size = ((n*(n+1)) / 2);
+ const auto x_size = n * x_inc;
+ auto ap_buffer = clblast::Buffer<double2>(context, ap_size);
+ auto x_buffer = clblast::Buffer<double2>(context, x_size);
+ ap_buffer.Write(queue, ap_size, reinterpret_cast<const double2*>(ap));
+ x_buffer.Write(queue, x_size, reinterpret_cast<double2*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Tpmv<double2>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n,
+ ap_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<double2*>(x));
+}
+
+// TRSV
+void cblas_strsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n,
+ const float* a, const int a_ld,
+ float* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto a_size = n * a_ld;
+ const auto x_size = n * x_inc;
+ auto a_buffer = clblast::Buffer<float>(context, a_size);
+ auto x_buffer = clblast::Buffer<float>(context, x_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
+ x_buffer.Write(queue, x_size, reinterpret_cast<float*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Trsv<float>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n,
+ a_buffer(), 0, a_ld,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<float*>(x));
+}
+void cblas_dtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n,
+ const double* a, const int a_ld,
+ double* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto a_size = n * a_ld;
+ const auto x_size = n * x_inc;
+ auto a_buffer = clblast::Buffer<double>(context, a_size);
+ auto x_buffer = clblast::Buffer<double>(context, x_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
+ x_buffer.Write(queue, x_size, reinterpret_cast<double*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Trsv<double>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n,
+ a_buffer(), 0, a_ld,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<double*>(x));
+}
+void cblas_ctrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n,
+ const void* a, const int a_ld,
+ void* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto a_size = n * a_ld;
+ const auto x_size = n * x_inc;
+ auto a_buffer = clblast::Buffer<float2>(context, a_size);
+ auto x_buffer = clblast::Buffer<float2>(context, x_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+ x_buffer.Write(queue, x_size, reinterpret_cast<float2*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Trsv<float2>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n,
+ a_buffer(), 0, a_ld,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<float2*>(x));
+}
+void cblas_ztrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n,
+ const void* a, const int a_ld,
+ void* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto a_size = n * a_ld;
+ const auto x_size = n * x_inc;
+ auto a_buffer = clblast::Buffer<double2>(context, a_size);
+ auto x_buffer = clblast::Buffer<double2>(context, x_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+ x_buffer.Write(queue, x_size, reinterpret_cast<double2*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Trsv<double2>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n,
+ a_buffer(), 0, a_ld,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<double2*>(x));
+}
+
+// TBSV
+void cblas_stbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n, const int k,
+ const float* a, const int a_ld,
+ float* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto a_size = n * a_ld;
+ const auto x_size = n * x_inc;
+ auto a_buffer = clblast::Buffer<float>(context, a_size);
+ auto x_buffer = clblast::Buffer<float>(context, x_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
+ x_buffer.Write(queue, x_size, reinterpret_cast<float*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Tbsv<float>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n, k,
+ a_buffer(), 0, a_ld,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<float*>(x));
+}
+void cblas_dtbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n, const int k,
+ const double* a, const int a_ld,
+ double* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto a_size = n * a_ld;
+ const auto x_size = n * x_inc;
+ auto a_buffer = clblast::Buffer<double>(context, a_size);
+ auto x_buffer = clblast::Buffer<double>(context, x_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
+ x_buffer.Write(queue, x_size, reinterpret_cast<double*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Tbsv<double>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n, k,
+ a_buffer(), 0, a_ld,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<double*>(x));
+}
+void cblas_ctbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n, const int k,
+ const void* a, const int a_ld,
+ void* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto a_size = n * a_ld;
+ const auto x_size = n * x_inc;
+ auto a_buffer = clblast::Buffer<float2>(context, a_size);
+ auto x_buffer = clblast::Buffer<float2>(context, x_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+ x_buffer.Write(queue, x_size, reinterpret_cast<float2*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Tbsv<float2>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n, k,
+ a_buffer(), 0, a_ld,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<float2*>(x));
+}
+void cblas_ztbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n, const int k,
+ const void* a, const int a_ld,
+ void* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto a_size = n * a_ld;
+ const auto x_size = n * x_inc;
+ auto a_buffer = clblast::Buffer<double2>(context, a_size);
+ auto x_buffer = clblast::Buffer<double2>(context, x_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+ x_buffer.Write(queue, x_size, reinterpret_cast<double2*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Tbsv<double2>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n, k,
+ a_buffer(), 0, a_ld,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<double2*>(x));
+}
+
+// TPSV
+void cblas_stpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n,
+ const float* ap,
+ float* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto ap_size = ((n*(n+1)) / 2);
+ const auto x_size = n * x_inc;
+ auto ap_buffer = clblast::Buffer<float>(context, ap_size);
+ auto x_buffer = clblast::Buffer<float>(context, x_size);
+ ap_buffer.Write(queue, ap_size, reinterpret_cast<const float*>(ap));
+ x_buffer.Write(queue, x_size, reinterpret_cast<float*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Tpsv<float>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n,
+ ap_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<float*>(x));
+}
+void cblas_dtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n,
+ const double* ap,
+ double* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto ap_size = ((n*(n+1)) / 2);
+ const auto x_size = n * x_inc;
+ auto ap_buffer = clblast::Buffer<double>(context, ap_size);
+ auto x_buffer = clblast::Buffer<double>(context, x_size);
+ ap_buffer.Write(queue, ap_size, reinterpret_cast<const double*>(ap));
+ x_buffer.Write(queue, x_size, reinterpret_cast<double*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Tpsv<double>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n,
+ ap_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<double*>(x));
+}
+void cblas_ctpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n,
+ const void* ap,
+ void* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto ap_size = ((n*(n+1)) / 2);
+ const auto x_size = n * x_inc;
+ auto ap_buffer = clblast::Buffer<float2>(context, ap_size);
+ auto x_buffer = clblast::Buffer<float2>(context, x_size);
+ ap_buffer.Write(queue, ap_size, reinterpret_cast<const float2*>(ap));
+ x_buffer.Write(queue, x_size, reinterpret_cast<float2*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Tpsv<float2>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n,
+ ap_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<float2*>(x));
+}
+void cblas_ztpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int n,
+ const void* ap,
+ void* x, const int x_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto ap_size = ((n*(n+1)) / 2);
+ const auto x_size = n * x_inc;
+ auto ap_buffer = clblast::Buffer<double2>(context, ap_size);
+ auto x_buffer = clblast::Buffer<double2>(context, x_size);
+ ap_buffer.Write(queue, ap_size, reinterpret_cast<const double2*>(ap));
+ x_buffer.Write(queue, x_size, reinterpret_cast<double2*>(x));
+ auto queue_cl = queue();
+ auto s = clblast::Tpsv<double2>(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ n,
+ ap_buffer(), 0,
+ x_buffer(), 0, x_inc,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ x_buffer.Read(queue, x_size, reinterpret_cast<double2*>(x));
+}
+
+// GER
+void cblas_sger(const CLBlastLayout layout,
+ const int m, const int n,
+ const float alpha,
+ const float* x, const int x_inc,
+ const float* y, const int y_inc,
+ float* a, const int a_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto x_size = m * x_inc;
+ const auto y_size = n * y_inc;
+ const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
+ auto x_buffer = clblast::Buffer<float>(context, x_size);
+ auto y_buffer = clblast::Buffer<float>(context, y_size);
+ auto a_buffer = clblast::Buffer<float>(context, a_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const float*>(y));
+ a_buffer.Write(queue, a_size, reinterpret_cast<float*>(a));
+ auto queue_cl = queue();
+ auto s = clblast::Ger(static_cast<clblast::Layout>(layout),
+ m, n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ a_buffer(), 0, a_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ a_buffer.Read(queue, a_size, reinterpret_cast<float*>(a));
+}
+void cblas_dger(const CLBlastLayout layout,
+ const int m, const int n,
+ const double alpha,
+ const double* x, const int x_inc,
+ const double* y, const int y_inc,
+ double* a, const int a_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto x_size = m * x_inc;
+ const auto y_size = n * y_inc;
+ const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
+ auto x_buffer = clblast::Buffer<double>(context, x_size);
+ auto y_buffer = clblast::Buffer<double>(context, y_size);
+ auto a_buffer = clblast::Buffer<double>(context, a_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const double*>(y));
+ a_buffer.Write(queue, a_size, reinterpret_cast<double*>(a));
+ auto queue_cl = queue();
+ auto s = clblast::Ger(static_cast<clblast::Layout>(layout),
+ m, n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ a_buffer(), 0, a_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ a_buffer.Read(queue, a_size, reinterpret_cast<double*>(a));
+}
+
+// GERU
+void cblas_cgeru(const CLBlastLayout layout,
+ const int m, const int n,
+ const void* alpha,
+ const void* x, const int x_inc,
+ const void* y, const int y_inc,
+ void* a, const int a_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+ const auto x_size = m * x_inc;
+ const auto y_size = n * y_inc;
+ const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
+ auto x_buffer = clblast::Buffer<float2>(context, x_size);
+ auto y_buffer = clblast::Buffer<float2>(context, y_size);
+ auto a_buffer = clblast::Buffer<float2>(context, a_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const float2*>(y));
+ a_buffer.Write(queue, a_size, reinterpret_cast<float2*>(a));
+ auto queue_cl = queue();
+ auto s = clblast::Geru(static_cast<clblast::Layout>(layout),
+ m, n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ a_buffer(), 0, a_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ a_buffer.Read(queue, a_size, reinterpret_cast<float2*>(a));
+}
+void cblas_zgeru(const CLBlastLayout layout,
+ const int m, const int n,
+ const void* alpha,
+ const void* x, const int x_inc,
+ const void* y, const int y_inc,
+ void* a, const int a_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+ const auto x_size = m * x_inc;
+ const auto y_size = n * y_inc;
+ const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
+ auto x_buffer = clblast::Buffer<double2>(context, x_size);
+ auto y_buffer = clblast::Buffer<double2>(context, y_size);
+ auto a_buffer = clblast::Buffer<double2>(context, a_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const double2*>(y));
+ a_buffer.Write(queue, a_size, reinterpret_cast<double2*>(a));
+ auto queue_cl = queue();
+ auto s = clblast::Geru(static_cast<clblast::Layout>(layout),
+ m, n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ a_buffer(), 0, a_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ a_buffer.Read(queue, a_size, reinterpret_cast<double2*>(a));
+}
+
+// GERC
+void cblas_cgerc(const CLBlastLayout layout,
+ const int m, const int n,
+ const void* alpha,
+ const void* x, const int x_inc,
+ const void* y, const int y_inc,
+ void* a, const int a_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+ const auto x_size = m * x_inc;
+ const auto y_size = n * y_inc;
+ const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
+ auto x_buffer = clblast::Buffer<float2>(context, x_size);
+ auto y_buffer = clblast::Buffer<float2>(context, y_size);
+ auto a_buffer = clblast::Buffer<float2>(context, a_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const float2*>(y));
+ a_buffer.Write(queue, a_size, reinterpret_cast<float2*>(a));
+ auto queue_cl = queue();
+ auto s = clblast::Gerc(static_cast<clblast::Layout>(layout),
+ m, n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ a_buffer(), 0, a_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ a_buffer.Read(queue, a_size, reinterpret_cast<float2*>(a));
+}
+void cblas_zgerc(const CLBlastLayout layout,
+ const int m, const int n,
+ const void* alpha,
+ const void* x, const int x_inc,
+ const void* y, const int y_inc,
+ void* a, const int a_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+ const auto x_size = m * x_inc;
+ const auto y_size = n * y_inc;
+ const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
+ auto x_buffer = clblast::Buffer<double2>(context, x_size);
+ auto y_buffer = clblast::Buffer<double2>(context, y_size);
+ auto a_buffer = clblast::Buffer<double2>(context, a_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const double2*>(y));
+ a_buffer.Write(queue, a_size, reinterpret_cast<double2*>(a));
+ auto queue_cl = queue();
+ auto s = clblast::Gerc(static_cast<clblast::Layout>(layout),
+ m, n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ a_buffer(), 0, a_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ a_buffer.Read(queue, a_size, reinterpret_cast<double2*>(a));
+}
+
+// HER
+void cblas_cher(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const float alpha,
+ const void* x, const int x_inc,
+ void* a, const int a_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto x_size = n * x_inc;
+ const auto a_size = n * a_ld;
+ auto x_buffer = clblast::Buffer<float2>(context, x_size);
+ auto a_buffer = clblast::Buffer<float2>(context, a_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+ a_buffer.Write(queue, a_size, reinterpret_cast<float2*>(a));
+ auto queue_cl = queue();
+ auto s = clblast::Her(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ a_buffer(), 0, a_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ a_buffer.Read(queue, a_size, reinterpret_cast<float2*>(a));
+}
+void cblas_zher(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const double alpha,
+ const void* x, const int x_inc,
+ void* a, const int a_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto x_size = n * x_inc;
+ const auto a_size = n * a_ld;
+ auto x_buffer = clblast::Buffer<double2>(context, x_size);
+ auto a_buffer = clblast::Buffer<double2>(context, a_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+ a_buffer.Write(queue, a_size, reinterpret_cast<double2*>(a));
+ auto queue_cl = queue();
+ auto s = clblast::Her(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ a_buffer(), 0, a_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ a_buffer.Read(queue, a_size, reinterpret_cast<double2*>(a));
+}
+
+// HPR
+void cblas_chpr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const float alpha,
+ const void* x, const int x_inc,
+ void* ap) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto x_size = n * x_inc;
+ const auto ap_size = ((n*(n+1)) / 2);
+ auto x_buffer = clblast::Buffer<float2>(context, x_size);
+ auto ap_buffer = clblast::Buffer<float2>(context, ap_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+ ap_buffer.Write(queue, ap_size, reinterpret_cast<float2*>(ap));
+ auto queue_cl = queue();
+ auto s = clblast::Hpr(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ ap_buffer(), 0,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ ap_buffer.Read(queue, ap_size, reinterpret_cast<float2*>(ap));
+}
+void cblas_zhpr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const double alpha,
+ const void* x, const int x_inc,
+ void* ap) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto x_size = n * x_inc;
+ const auto ap_size = ((n*(n+1)) / 2);
+ auto x_buffer = clblast::Buffer<double2>(context, x_size);
+ auto ap_buffer = clblast::Buffer<double2>(context, ap_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+ ap_buffer.Write(queue, ap_size, reinterpret_cast<double2*>(ap));
+ auto queue_cl = queue();
+ auto s = clblast::Hpr(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ ap_buffer(), 0,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ ap_buffer.Read(queue, ap_size, reinterpret_cast<double2*>(ap));
+}
+
+// HER2
+void cblas_cher2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const void* alpha,
+ const void* x, const int x_inc,
+ const void* y, const int y_inc,
+ void* a, const int a_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+ const auto x_size = n * x_inc;
+ const auto y_size = n * y_inc;
+ const auto a_size = n * a_ld;
+ auto x_buffer = clblast::Buffer<float2>(context, x_size);
+ auto y_buffer = clblast::Buffer<float2>(context, y_size);
+ auto a_buffer = clblast::Buffer<float2>(context, a_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const float2*>(y));
+ a_buffer.Write(queue, a_size, reinterpret_cast<float2*>(a));
+ auto queue_cl = queue();
+ auto s = clblast::Her2(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ a_buffer(), 0, a_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ a_buffer.Read(queue, a_size, reinterpret_cast<float2*>(a));
+}
+void cblas_zher2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const void* alpha,
+ const void* x, const int x_inc,
+ const void* y, const int y_inc,
+ void* a, const int a_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+ const auto x_size = n * x_inc;
+ const auto y_size = n * y_inc;
+ const auto a_size = n * a_ld;
+ auto x_buffer = clblast::Buffer<double2>(context, x_size);
+ auto y_buffer = clblast::Buffer<double2>(context, y_size);
+ auto a_buffer = clblast::Buffer<double2>(context, a_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const double2*>(y));
+ a_buffer.Write(queue, a_size, reinterpret_cast<double2*>(a));
+ auto queue_cl = queue();
+ auto s = clblast::Her2(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ a_buffer(), 0, a_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ a_buffer.Read(queue, a_size, reinterpret_cast<double2*>(a));
+}
+
+// HPR2
+void cblas_chpr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const void* alpha,
+ const void* x, const int x_inc,
+ const void* y, const int y_inc,
+ void* ap) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+ const auto x_size = n * x_inc;
+ const auto y_size = n * y_inc;
+ const auto ap_size = ((n*(n+1)) / 2);
+ auto x_buffer = clblast::Buffer<float2>(context, x_size);
+ auto y_buffer = clblast::Buffer<float2>(context, y_size);
+ auto ap_buffer = clblast::Buffer<float2>(context, ap_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const float2*>(y));
+ ap_buffer.Write(queue, ap_size, reinterpret_cast<float2*>(ap));
+ auto queue_cl = queue();
+ auto s = clblast::Hpr2(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ ap_buffer(), 0,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ ap_buffer.Read(queue, ap_size, reinterpret_cast<float2*>(ap));
+}
+void cblas_zhpr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const void* alpha,
+ const void* x, const int x_inc,
+ const void* y, const int y_inc,
+ void* ap) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+ const auto x_size = n * x_inc;
+ const auto y_size = n * y_inc;
+ const auto ap_size = ((n*(n+1)) / 2);
+ auto x_buffer = clblast::Buffer<double2>(context, x_size);
+ auto y_buffer = clblast::Buffer<double2>(context, y_size);
+ auto ap_buffer = clblast::Buffer<double2>(context, ap_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const double2*>(y));
+ ap_buffer.Write(queue, ap_size, reinterpret_cast<double2*>(ap));
+ auto queue_cl = queue();
+ auto s = clblast::Hpr2(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ ap_buffer(), 0,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ ap_buffer.Read(queue, ap_size, reinterpret_cast<double2*>(ap));
+}
+
+// SYR
+void cblas_ssyr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const float alpha,
+ const float* x, const int x_inc,
+ float* a, const int a_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto x_size = n * x_inc;
+ const auto a_size = n * a_ld;
+ auto x_buffer = clblast::Buffer<float>(context, x_size);
+ auto a_buffer = clblast::Buffer<float>(context, a_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+ a_buffer.Write(queue, a_size, reinterpret_cast<float*>(a));
+ auto queue_cl = queue();
+ auto s = clblast::Syr(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ a_buffer(), 0, a_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ a_buffer.Read(queue, a_size, reinterpret_cast<float*>(a));
+}
+void cblas_dsyr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const double alpha,
+ const double* x, const int x_inc,
+ double* a, const int a_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto x_size = n * x_inc;
+ const auto a_size = n * a_ld;
+ auto x_buffer = clblast::Buffer<double>(context, x_size);
+ auto a_buffer = clblast::Buffer<double>(context, a_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+ a_buffer.Write(queue, a_size, reinterpret_cast<double*>(a));
+ auto queue_cl = queue();
+ auto s = clblast::Syr(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ a_buffer(), 0, a_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ a_buffer.Read(queue, a_size, reinterpret_cast<double*>(a));
+}
+
+// SPR
+void cblas_sspr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const float alpha,
+ const float* x, const int x_inc,
+ float* ap) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto x_size = n * x_inc;
+ const auto ap_size = ((n*(n+1)) / 2);
+ auto x_buffer = clblast::Buffer<float>(context, x_size);
+ auto ap_buffer = clblast::Buffer<float>(context, ap_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+ ap_buffer.Write(queue, ap_size, reinterpret_cast<float*>(ap));
+ auto queue_cl = queue();
+ auto s = clblast::Spr(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ ap_buffer(), 0,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ ap_buffer.Read(queue, ap_size, reinterpret_cast<float*>(ap));
+}
+void cblas_dspr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const double alpha,
+ const double* x, const int x_inc,
+ double* ap) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto x_size = n * x_inc;
+ const auto ap_size = ((n*(n+1)) / 2);
+ auto x_buffer = clblast::Buffer<double>(context, x_size);
+ auto ap_buffer = clblast::Buffer<double>(context, ap_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+ ap_buffer.Write(queue, ap_size, reinterpret_cast<double*>(ap));
+ auto queue_cl = queue();
+ auto s = clblast::Spr(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ ap_buffer(), 0,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ ap_buffer.Read(queue, ap_size, reinterpret_cast<double*>(ap));
+}
+
+// SYR2
+void cblas_ssyr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const float alpha,
+ const float* x, const int x_inc,
+ const float* y, const int y_inc,
+ float* a, const int a_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto x_size = n * x_inc;
+ const auto y_size = n * y_inc;
+ const auto a_size = n * a_ld;
+ auto x_buffer = clblast::Buffer<float>(context, x_size);
+ auto y_buffer = clblast::Buffer<float>(context, y_size);
+ auto a_buffer = clblast::Buffer<float>(context, a_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const float*>(y));
+ a_buffer.Write(queue, a_size, reinterpret_cast<float*>(a));
+ auto queue_cl = queue();
+ auto s = clblast::Syr2(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ a_buffer(), 0, a_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ a_buffer.Read(queue, a_size, reinterpret_cast<float*>(a));
+}
+void cblas_dsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const double alpha,
+ const double* x, const int x_inc,
+ const double* y, const int y_inc,
+ double* a, const int a_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto x_size = n * x_inc;
+ const auto y_size = n * y_inc;
+ const auto a_size = n * a_ld;
+ auto x_buffer = clblast::Buffer<double>(context, x_size);
+ auto y_buffer = clblast::Buffer<double>(context, y_size);
+ auto a_buffer = clblast::Buffer<double>(context, a_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const double*>(y));
+ a_buffer.Write(queue, a_size, reinterpret_cast<double*>(a));
+ auto queue_cl = queue();
+ auto s = clblast::Syr2(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ a_buffer(), 0, a_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ a_buffer.Read(queue, a_size, reinterpret_cast<double*>(a));
+}
+
+// SPR2
+void cblas_sspr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const float alpha,
+ const float* x, const int x_inc,
+ const float* y, const int y_inc,
+ float* ap) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto x_size = n * x_inc;
+ const auto y_size = n * y_inc;
+ const auto ap_size = ((n*(n+1)) / 2);
+ auto x_buffer = clblast::Buffer<float>(context, x_size);
+ auto y_buffer = clblast::Buffer<float>(context, y_size);
+ auto ap_buffer = clblast::Buffer<float>(context, ap_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const float*>(y));
+ ap_buffer.Write(queue, ap_size, reinterpret_cast<float*>(ap));
+ auto queue_cl = queue();
+ auto s = clblast::Spr2(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ ap_buffer(), 0,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ ap_buffer.Read(queue, ap_size, reinterpret_cast<float*>(ap));
+}
+void cblas_dspr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+ const int n,
+ const double alpha,
+ const double* x, const int x_inc,
+ const double* y, const int y_inc,
+ double* ap) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto x_size = n * x_inc;
+ const auto y_size = n * y_inc;
+ const auto ap_size = ((n*(n+1)) / 2);
+ auto x_buffer = clblast::Buffer<double>(context, x_size);
+ auto y_buffer = clblast::Buffer<double>(context, y_size);
+ auto ap_buffer = clblast::Buffer<double>(context, ap_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const double*>(y));
+ ap_buffer.Write(queue, ap_size, reinterpret_cast<double*>(ap));
+ auto queue_cl = queue();
+ auto s = clblast::Spr2(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc,
+ y_buffer(), 0, y_inc,
+ ap_buffer(), 0,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ ap_buffer.Read(queue, ap_size, reinterpret_cast<double*>(ap));
+}
+
+// =================================================================================================
+// BLAS level-3 (matrix-matrix) routines
+// =================================================================================================
+
+// GEMM
+void cblas_sgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+ const int m, const int n, const int k,
+ const float alpha,
+ const float* a, const int a_ld,
+ const float* b, const int b_ld,
+ const float beta,
+ float* c, const int c_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto beta_cpp = beta;
+ const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? m * a_ld : k * a_ld;
+ const auto b_size = ((layout == CLBlastLayoutColMajor && b_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && b_transpose == CLBlastTransposeNo)) ? k * b_ld : n * b_ld;
+ const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld;
+ auto a_buffer = clblast::Buffer<float>(context, a_size);
+ auto b_buffer = clblast::Buffer<float>(context, b_size);
+ auto c_buffer = clblast::Buffer<float>(context, c_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<const float*>(b));
+ c_buffer.Write(queue, c_size, reinterpret_cast<float*>(c));
+ auto queue_cl = queue();
+ auto s = clblast::Gemm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Transpose>(b_transpose),
+ m, n, k,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ beta_cpp,
+ c_buffer(), 0, c_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ c_buffer.Read(queue, c_size, reinterpret_cast<float*>(c));
+}
+void cblas_dgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+ const int m, const int n, const int k,
+ const double alpha,
+ const double* a, const int a_ld,
+ const double* b, const int b_ld,
+ const double beta,
+ double* c, const int c_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto beta_cpp = beta;
+ const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? m * a_ld : k * a_ld;
+ const auto b_size = ((layout == CLBlastLayoutColMajor && b_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && b_transpose == CLBlastTransposeNo)) ? k * b_ld : n * b_ld;
+ const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld;
+ auto a_buffer = clblast::Buffer<double>(context, a_size);
+ auto b_buffer = clblast::Buffer<double>(context, b_size);
+ auto c_buffer = clblast::Buffer<double>(context, c_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<const double*>(b));
+ c_buffer.Write(queue, c_size, reinterpret_cast<double*>(c));
+ auto queue_cl = queue();
+ auto s = clblast::Gemm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Transpose>(b_transpose),
+ m, n, k,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ beta_cpp,
+ c_buffer(), 0, c_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ c_buffer.Read(queue, c_size, reinterpret_cast<double*>(c));
+}
+void cblas_cgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+ const int m, const int n, const int k,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* b, const int b_ld,
+ const void* beta,
+ void* c, const int c_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+ const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
+ const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? m * a_ld : k * a_ld;
+ const auto b_size = ((layout == CLBlastLayoutColMajor && b_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && b_transpose == CLBlastTransposeNo)) ? k * b_ld : n * b_ld;
+ const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld;
+ auto a_buffer = clblast::Buffer<float2>(context, a_size);
+ auto b_buffer = clblast::Buffer<float2>(context, b_size);
+ auto c_buffer = clblast::Buffer<float2>(context, c_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<const float2*>(b));
+ c_buffer.Write(queue, c_size, reinterpret_cast<float2*>(c));
+ auto queue_cl = queue();
+ auto s = clblast::Gemm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Transpose>(b_transpose),
+ m, n, k,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ beta_cpp,
+ c_buffer(), 0, c_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ c_buffer.Read(queue, c_size, reinterpret_cast<float2*>(c));
+}
+void cblas_zgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+ const int m, const int n, const int k,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* b, const int b_ld,
+ const void* beta,
+ void* c, const int c_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+ const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
+ const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? m * a_ld : k * a_ld;
+ const auto b_size = ((layout == CLBlastLayoutColMajor && b_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && b_transpose == CLBlastTransposeNo)) ? k * b_ld : n * b_ld;
+ const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld;
+ auto a_buffer = clblast::Buffer<double2>(context, a_size);
+ auto b_buffer = clblast::Buffer<double2>(context, b_size);
+ auto c_buffer = clblast::Buffer<double2>(context, c_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<const double2*>(b));
+ c_buffer.Write(queue, c_size, reinterpret_cast<double2*>(c));
+ auto queue_cl = queue();
+ auto s = clblast::Gemm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Transpose>(b_transpose),
+ m, n, k,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ beta_cpp,
+ c_buffer(), 0, c_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ c_buffer.Read(queue, c_size, reinterpret_cast<double2*>(c));
+}
+
+// SYMM
+void cblas_ssymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+ const int m, const int n,
+ const float alpha,
+ const float* a, const int a_ld,
+ const float* b, const int b_ld,
+ const float beta,
+ float* c, const int c_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto beta_cpp = beta;
+ const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld;
+ const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld;
+ const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld;
+ auto a_buffer = clblast::Buffer<float>(context, a_size);
+ auto b_buffer = clblast::Buffer<float>(context, b_size);
+ auto c_buffer = clblast::Buffer<float>(context, c_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<const float*>(b));
+ c_buffer.Write(queue, c_size, reinterpret_cast<float*>(c));
+ auto queue_cl = queue();
+ auto s = clblast::Symm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Side>(side),
+ static_cast<clblast::Triangle>(triangle),
+ m, n,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ beta_cpp,
+ c_buffer(), 0, c_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ c_buffer.Read(queue, c_size, reinterpret_cast<float*>(c));
+}
+void cblas_dsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+ const int m, const int n,
+ const double alpha,
+ const double* a, const int a_ld,
+ const double* b, const int b_ld,
+ const double beta,
+ double* c, const int c_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto beta_cpp = beta;
+ const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld;
+ const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld;
+ const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld;
+ auto a_buffer = clblast::Buffer<double>(context, a_size);
+ auto b_buffer = clblast::Buffer<double>(context, b_size);
+ auto c_buffer = clblast::Buffer<double>(context, c_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<const double*>(b));
+ c_buffer.Write(queue, c_size, reinterpret_cast<double*>(c));
+ auto queue_cl = queue();
+ auto s = clblast::Symm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Side>(side),
+ static_cast<clblast::Triangle>(triangle),
+ m, n,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ beta_cpp,
+ c_buffer(), 0, c_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ c_buffer.Read(queue, c_size, reinterpret_cast<double*>(c));
+}
+void cblas_csymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+ const int m, const int n,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* b, const int b_ld,
+ const void* beta,
+ void* c, const int c_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+ const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
+ const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld;
+ const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld;
+ const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld;
+ auto a_buffer = clblast::Buffer<float2>(context, a_size);
+ auto b_buffer = clblast::Buffer<float2>(context, b_size);
+ auto c_buffer = clblast::Buffer<float2>(context, c_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<const float2*>(b));
+ c_buffer.Write(queue, c_size, reinterpret_cast<float2*>(c));
+ auto queue_cl = queue();
+ auto s = clblast::Symm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Side>(side),
+ static_cast<clblast::Triangle>(triangle),
+ m, n,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ beta_cpp,
+ c_buffer(), 0, c_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ c_buffer.Read(queue, c_size, reinterpret_cast<float2*>(c));
+}
+void cblas_zsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+ const int m, const int n,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* b, const int b_ld,
+ const void* beta,
+ void* c, const int c_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+ const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
+ const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld;
+ const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld;
+ const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld;
+ auto a_buffer = clblast::Buffer<double2>(context, a_size);
+ auto b_buffer = clblast::Buffer<double2>(context, b_size);
+ auto c_buffer = clblast::Buffer<double2>(context, c_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<const double2*>(b));
+ c_buffer.Write(queue, c_size, reinterpret_cast<double2*>(c));
+ auto queue_cl = queue();
+ auto s = clblast::Symm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Side>(side),
+ static_cast<clblast::Triangle>(triangle),
+ m, n,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ beta_cpp,
+ c_buffer(), 0, c_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ c_buffer.Read(queue, c_size, reinterpret_cast<double2*>(c));
+}
+
+// HEMM
+void cblas_chemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+ const int m, const int n,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* b, const int b_ld,
+ const void* beta,
+ void* c, const int c_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+ const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
+ const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld;
+ const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld;
+ const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld;
+ auto a_buffer = clblast::Buffer<float2>(context, a_size);
+ auto b_buffer = clblast::Buffer<float2>(context, b_size);
+ auto c_buffer = clblast::Buffer<float2>(context, c_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<const float2*>(b));
+ c_buffer.Write(queue, c_size, reinterpret_cast<float2*>(c));
+ auto queue_cl = queue();
+ auto s = clblast::Hemm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Side>(side),
+ static_cast<clblast::Triangle>(triangle),
+ m, n,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ beta_cpp,
+ c_buffer(), 0, c_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ c_buffer.Read(queue, c_size, reinterpret_cast<float2*>(c));
+}
+void cblas_zhemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+ const int m, const int n,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* b, const int b_ld,
+ const void* beta,
+ void* c, const int c_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+ const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
+ const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld;
+ const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld;
+ const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld;
+ auto a_buffer = clblast::Buffer<double2>(context, a_size);
+ auto b_buffer = clblast::Buffer<double2>(context, b_size);
+ auto c_buffer = clblast::Buffer<double2>(context, c_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<const double2*>(b));
+ c_buffer.Write(queue, c_size, reinterpret_cast<double2*>(c));
+ auto queue_cl = queue();
+ auto s = clblast::Hemm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Side>(side),
+ static_cast<clblast::Triangle>(triangle),
+ m, n,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ beta_cpp,
+ c_buffer(), 0, c_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ c_buffer.Read(queue, c_size, reinterpret_cast<double2*>(c));
+}
+
+// SYRK
+void cblas_ssyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+ const int n, const int k,
+ const float alpha,
+ const float* a, const int a_ld,
+ const float beta,
+ float* c, const int c_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto beta_cpp = beta;
+ const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld;
+ const auto c_size = n * c_ld;
+ auto a_buffer = clblast::Buffer<float>(context, a_size);
+ auto c_buffer = clblast::Buffer<float>(context, c_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
+ c_buffer.Write(queue, c_size, reinterpret_cast<float*>(c));
+ auto queue_cl = queue();
+ auto s = clblast::Syrk(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ n, k,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ beta_cpp,
+ c_buffer(), 0, c_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ c_buffer.Read(queue, c_size, reinterpret_cast<float*>(c));
+}
+void cblas_dsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+ const int n, const int k,
+ const double alpha,
+ const double* a, const int a_ld,
+ const double beta,
+ double* c, const int c_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto beta_cpp = beta;
+ const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld;
+ const auto c_size = n * c_ld;
+ auto a_buffer = clblast::Buffer<double>(context, a_size);
+ auto c_buffer = clblast::Buffer<double>(context, c_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
+ c_buffer.Write(queue, c_size, reinterpret_cast<double*>(c));
+ auto queue_cl = queue();
+ auto s = clblast::Syrk(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ n, k,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ beta_cpp,
+ c_buffer(), 0, c_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ c_buffer.Read(queue, c_size, reinterpret_cast<double*>(c));
+}
+void cblas_csyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+ const int n, const int k,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* beta,
+ void* c, const int c_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+ const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
+ const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld;
+ const auto c_size = n * c_ld;
+ auto a_buffer = clblast::Buffer<float2>(context, a_size);
+ auto c_buffer = clblast::Buffer<float2>(context, c_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+ c_buffer.Write(queue, c_size, reinterpret_cast<float2*>(c));
+ auto queue_cl = queue();
+ auto s = clblast::Syrk(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ n, k,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ beta_cpp,
+ c_buffer(), 0, c_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ c_buffer.Read(queue, c_size, reinterpret_cast<float2*>(c));
+}
+void cblas_zsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+ const int n, const int k,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* beta,
+ void* c, const int c_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+ const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
+ const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld;
+ const auto c_size = n * c_ld;
+ auto a_buffer = clblast::Buffer<double2>(context, a_size);
+ auto c_buffer = clblast::Buffer<double2>(context, c_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+ c_buffer.Write(queue, c_size, reinterpret_cast<double2*>(c));
+ auto queue_cl = queue();
+ auto s = clblast::Syrk(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ n, k,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ beta_cpp,
+ c_buffer(), 0, c_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ c_buffer.Read(queue, c_size, reinterpret_cast<double2*>(c));
+}
+
+// HERK
+void cblas_cherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+ const int n, const int k,
+ const float alpha,
+ const void* a, const int a_ld,
+ const float beta,
+ void* c, const int c_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto beta_cpp = beta;
+ const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld;
+ const auto c_size = n * c_ld;
+ auto a_buffer = clblast::Buffer<float2>(context, a_size);
+ auto c_buffer = clblast::Buffer<float2>(context, c_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+ c_buffer.Write(queue, c_size, reinterpret_cast<float2*>(c));
+ auto queue_cl = queue();
+ auto s = clblast::Herk(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ n, k,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ beta_cpp,
+ c_buffer(), 0, c_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ c_buffer.Read(queue, c_size, reinterpret_cast<float2*>(c));
+}
+void cblas_zherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+ const int n, const int k,
+ const double alpha,
+ const void* a, const int a_ld,
+ const double beta,
+ void* c, const int c_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto beta_cpp = beta;
+ const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld;
+ const auto c_size = n * c_ld;
+ auto a_buffer = clblast::Buffer<double2>(context, a_size);
+ auto c_buffer = clblast::Buffer<double2>(context, c_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+ c_buffer.Write(queue, c_size, reinterpret_cast<double2*>(c));
+ auto queue_cl = queue();
+ auto s = clblast::Herk(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ n, k,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ beta_cpp,
+ c_buffer(), 0, c_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ c_buffer.Read(queue, c_size, reinterpret_cast<double2*>(c));
+}
+
+// SYR2K
+void cblas_ssyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+ const int n, const int k,
+ const float alpha,
+ const float* a, const int a_ld,
+ const float* b, const int b_ld,
+ const float beta,
+ float* c, const int c_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto beta_cpp = beta;
+ const auto a_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld;
+ const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * b_ld : k * b_ld;
+ const auto c_size = n * c_ld;
+ auto a_buffer = clblast::Buffer<float>(context, a_size);
+ auto b_buffer = clblast::Buffer<float>(context, b_size);
+ auto c_buffer = clblast::Buffer<float>(context, c_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<const float*>(b));
+ c_buffer.Write(queue, c_size, reinterpret_cast<float*>(c));
+ auto queue_cl = queue();
+ auto s = clblast::Syr2k(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(ab_transpose),
+ n, k,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ beta_cpp,
+ c_buffer(), 0, c_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ c_buffer.Read(queue, c_size, reinterpret_cast<float*>(c));
+}
+void cblas_dsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+ const int n, const int k,
+ const double alpha,
+ const double* a, const int a_ld,
+ const double* b, const int b_ld,
+ const double beta,
+ double* c, const int c_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto beta_cpp = beta;
+ const auto a_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld;
+ const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * b_ld : k * b_ld;
+ const auto c_size = n * c_ld;
+ auto a_buffer = clblast::Buffer<double>(context, a_size);
+ auto b_buffer = clblast::Buffer<double>(context, b_size);
+ auto c_buffer = clblast::Buffer<double>(context, c_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<const double*>(b));
+ c_buffer.Write(queue, c_size, reinterpret_cast<double*>(c));
+ auto queue_cl = queue();
+ auto s = clblast::Syr2k(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(ab_transpose),
+ n, k,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ beta_cpp,
+ c_buffer(), 0, c_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ c_buffer.Read(queue, c_size, reinterpret_cast<double*>(c));
+}
+void cblas_csyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+ const int n, const int k,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* b, const int b_ld,
+ const void* beta,
+ void* c, const int c_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+ const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
+ const auto a_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld;
+ const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * b_ld : k * b_ld;
+ const auto c_size = n * c_ld;
+ auto a_buffer = clblast::Buffer<float2>(context, a_size);
+ auto b_buffer = clblast::Buffer<float2>(context, b_size);
+ auto c_buffer = clblast::Buffer<float2>(context, c_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<const float2*>(b));
+ c_buffer.Write(queue, c_size, reinterpret_cast<float2*>(c));
+ auto queue_cl = queue();
+ auto s = clblast::Syr2k(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(ab_transpose),
+ n, k,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ beta_cpp,
+ c_buffer(), 0, c_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ c_buffer.Read(queue, c_size, reinterpret_cast<float2*>(c));
+}
+void cblas_zsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+ const int n, const int k,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* b, const int b_ld,
+ const void* beta,
+ void* c, const int c_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+ const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
+ const auto a_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld;
+ const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * b_ld : k * b_ld;
+ const auto c_size = n * c_ld;
+ auto a_buffer = clblast::Buffer<double2>(context, a_size);
+ auto b_buffer = clblast::Buffer<double2>(context, b_size);
+ auto c_buffer = clblast::Buffer<double2>(context, c_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<const double2*>(b));
+ c_buffer.Write(queue, c_size, reinterpret_cast<double2*>(c));
+ auto queue_cl = queue();
+ auto s = clblast::Syr2k(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(ab_transpose),
+ n, k,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ beta_cpp,
+ c_buffer(), 0, c_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ c_buffer.Read(queue, c_size, reinterpret_cast<double2*>(c));
+}
+
+// HER2K
+void cblas_cher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+ const int n, const int k,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* b, const int b_ld,
+ const float beta,
+ void* c, const int c_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+ const auto beta_cpp = beta;
+ const auto a_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld;
+ const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * b_ld : k * b_ld;
+ const auto c_size = n * c_ld;
+ auto a_buffer = clblast::Buffer<float2>(context, a_size);
+ auto b_buffer = clblast::Buffer<float2>(context, b_size);
+ auto c_buffer = clblast::Buffer<float2>(context, c_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<const float2*>(b));
+ c_buffer.Write(queue, c_size, reinterpret_cast<float2*>(c));
+ auto queue_cl = queue();
+ auto s = clblast::Her2k(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(ab_transpose),
+ n, k,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ beta_cpp,
+ c_buffer(), 0, c_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ c_buffer.Read(queue, c_size, reinterpret_cast<float2*>(c));
+}
+void cblas_zher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+ const int n, const int k,
+ const void* alpha,
+ const void* a, const int a_ld,
+ const void* b, const int b_ld,
+ const double beta,
+ void* c, const int c_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+ const auto beta_cpp = beta;
+ const auto a_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld;
+ const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * b_ld : k * b_ld;
+ const auto c_size = n * c_ld;
+ auto a_buffer = clblast::Buffer<double2>(context, a_size);
+ auto b_buffer = clblast::Buffer<double2>(context, b_size);
+ auto c_buffer = clblast::Buffer<double2>(context, c_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<const double2*>(b));
+ c_buffer.Write(queue, c_size, reinterpret_cast<double2*>(c));
+ auto queue_cl = queue();
+ auto s = clblast::Her2k(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(ab_transpose),
+ n, k,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ beta_cpp,
+ c_buffer(), 0, c_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ c_buffer.Read(queue, c_size, reinterpret_cast<double2*>(c));
+}
+
+// TRMM
+void cblas_strmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int m, const int n,
+ const float alpha,
+ const float* a, const int a_ld,
+ float* b, const int b_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld;
+ const auto b_size = (layout == CLBlastLayoutRowMajor) ? m * b_ld : n * b_ld;
+ auto a_buffer = clblast::Buffer<float>(context, a_size);
+ auto b_buffer = clblast::Buffer<float>(context, b_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<float*>(b));
+ auto queue_cl = queue();
+ auto s = clblast::Trmm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Side>(side),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ m, n,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ b_buffer.Read(queue, b_size, reinterpret_cast<float*>(b));
+}
+void cblas_dtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int m, const int n,
+ const double alpha,
+ const double* a, const int a_ld,
+ double* b, const int b_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld;
+ const auto b_size = (layout == CLBlastLayoutRowMajor) ? m * b_ld : n * b_ld;
+ auto a_buffer = clblast::Buffer<double>(context, a_size);
+ auto b_buffer = clblast::Buffer<double>(context, b_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<double*>(b));
+ auto queue_cl = queue();
+ auto s = clblast::Trmm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Side>(side),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ m, n,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ b_buffer.Read(queue, b_size, reinterpret_cast<double*>(b));
+}
+void cblas_ctrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int m, const int n,
+ const void* alpha,
+ const void* a, const int a_ld,
+ void* b, const int b_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+ const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld;
+ const auto b_size = (layout == CLBlastLayoutRowMajor) ? m * b_ld : n * b_ld;
+ auto a_buffer = clblast::Buffer<float2>(context, a_size);
+ auto b_buffer = clblast::Buffer<float2>(context, b_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<float2*>(b));
+ auto queue_cl = queue();
+ auto s = clblast::Trmm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Side>(side),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ m, n,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ b_buffer.Read(queue, b_size, reinterpret_cast<float2*>(b));
+}
+void cblas_ztrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int m, const int n,
+ const void* alpha,
+ const void* a, const int a_ld,
+ void* b, const int b_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+ const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld;
+ const auto b_size = (layout == CLBlastLayoutRowMajor) ? m * b_ld : n * b_ld;
+ auto a_buffer = clblast::Buffer<double2>(context, a_size);
+ auto b_buffer = clblast::Buffer<double2>(context, b_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<double2*>(b));
+ auto queue_cl = queue();
+ auto s = clblast::Trmm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Side>(side),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ m, n,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ b_buffer.Read(queue, b_size, reinterpret_cast<double2*>(b));
+}
+
+// TRSM
+void cblas_strsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int m, const int n,
+ const float alpha,
+ const float* a, const int a_ld,
+ float* b, const int b_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld;
+ const auto b_size = (layout == CLBlastLayoutRowMajor) ? m * b_ld : n * b_ld;
+ auto a_buffer = clblast::Buffer<float>(context, a_size);
+ auto b_buffer = clblast::Buffer<float>(context, b_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<float*>(b));
+ auto queue_cl = queue();
+ auto s = clblast::Trsm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Side>(side),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ m, n,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ b_buffer.Read(queue, b_size, reinterpret_cast<float*>(b));
+}
+void cblas_dtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int m, const int n,
+ const double alpha,
+ const double* a, const int a_ld,
+ double* b, const int b_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld;
+ const auto b_size = (layout == CLBlastLayoutRowMajor) ? m * b_ld : n * b_ld;
+ auto a_buffer = clblast::Buffer<double>(context, a_size);
+ auto b_buffer = clblast::Buffer<double>(context, b_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<double*>(b));
+ auto queue_cl = queue();
+ auto s = clblast::Trsm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Side>(side),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ m, n,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ b_buffer.Read(queue, b_size, reinterpret_cast<double*>(b));
+}
+void cblas_ctrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int m, const int n,
+ const void* alpha,
+ const void* a, const int a_ld,
+ void* b, const int b_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+ const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld;
+ const auto b_size = (layout == CLBlastLayoutRowMajor) ? m * b_ld : n * b_ld;
+ auto a_buffer = clblast::Buffer<float2>(context, a_size);
+ auto b_buffer = clblast::Buffer<float2>(context, b_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<float2*>(b));
+ auto queue_cl = queue();
+ auto s = clblast::Trsm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Side>(side),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ m, n,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ b_buffer.Read(queue, b_size, reinterpret_cast<float2*>(b));
+}
+void cblas_ztrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+ const int m, const int n,
+ const void* alpha,
+ const void* a, const int a_ld,
+ void* b, const int b_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+ const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld;
+ const auto b_size = (layout == CLBlastLayoutRowMajor) ? m * b_ld : n * b_ld;
+ auto a_buffer = clblast::Buffer<double2>(context, a_size);
+ auto b_buffer = clblast::Buffer<double2>(context, b_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<double2*>(b));
+ auto queue_cl = queue();
+ auto s = clblast::Trsm(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Side>(side),
+ static_cast<clblast::Triangle>(triangle),
+ static_cast<clblast::Transpose>(a_transpose),
+ static_cast<clblast::Diagonal>(diagonal),
+ m, n,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ b_buffer.Read(queue, b_size, reinterpret_cast<double2*>(b));
+}
+
+// =================================================================================================
+// Extra non-BLAS routines (level-X)
+// =================================================================================================
+
+// OMATCOPY
+void cblas_somatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const int m, const int n,
+ const float alpha,
+ const float* a, const int a_ld,
+ float* b, const int b_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
+ const auto b_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? n * b_ld : m * b_ld;
+ auto a_buffer = clblast::Buffer<float>(context, a_size);
+ auto b_buffer = clblast::Buffer<float>(context, b_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<float*>(b));
+ auto queue_cl = queue();
+ auto s = clblast::Omatcopy(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Transpose>(a_transpose),
+ m, n,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ b_buffer.Read(queue, b_size, reinterpret_cast<float*>(b));
+}
+void cblas_domatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const int m, const int n,
+ const double alpha,
+ const double* a, const int a_ld,
+ double* b, const int b_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = alpha;
+ const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
+ const auto b_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? n * b_ld : m * b_ld;
+ auto a_buffer = clblast::Buffer<double>(context, a_size);
+ auto b_buffer = clblast::Buffer<double>(context, b_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<double*>(b));
+ auto queue_cl = queue();
+ auto s = clblast::Omatcopy(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Transpose>(a_transpose),
+ m, n,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ b_buffer.Read(queue, b_size, reinterpret_cast<double*>(b));
+}
+void cblas_comatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const int m, const int n,
+ const void* alpha,
+ const void* a, const int a_ld,
+ void* b, const int b_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+ const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
+ const auto b_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? n * b_ld : m * b_ld;
+ auto a_buffer = clblast::Buffer<float2>(context, a_size);
+ auto b_buffer = clblast::Buffer<float2>(context, b_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<float2*>(b));
+ auto queue_cl = queue();
+ auto s = clblast::Omatcopy(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Transpose>(a_transpose),
+ m, n,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ b_buffer.Read(queue, b_size, reinterpret_cast<float2*>(b));
+}
+void cblas_zomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+ const int m, const int n,
+ const void* alpha,
+ const void* a, const int a_ld,
+ void* b, const int b_ld) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device);
+ const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+ const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
+ const auto b_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? n * b_ld : m * b_ld;
+ auto a_buffer = clblast::Buffer<double2>(context, a_size);
+ auto b_buffer = clblast::Buffer<double2>(context, b_size);
+ a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
+ b_buffer.Write(queue, b_size, reinterpret_cast<double2*>(b));
+ auto queue_cl = queue();
+ auto s = clblast::Omatcopy(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Transpose>(a_transpose),
+ m, n,
+ alpha_cpp,
+ a_buffer(), 0, a_ld,
+ b_buffer(), 0, b_ld,
+ &queue_cl);
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+ }
+ b_buffer.Read(queue, b_size, reinterpret_cast<double2*>(b));
+}
+
+// =================================================================================================
diff --git a/src/routines/level1/xscal.cpp b/src/routines/level1/xscal.cpp
index 17410f01..0521b1e5 100644
--- a/src/routines/level1/xscal.cpp
+++ b/src/routines/level1/xscal.cpp
@@ -55,12 +55,12 @@ void Xscal<T>::DoScal(const size_t n, const T alpha,
// Sets the kernel arguments
if (use_fast_kernel) {
kernel.SetArgument(0, static_cast<int>(n));
- kernel.SetArgument(1, alpha);
+ kernel.SetArgument(1, GetRealArg(alpha));
kernel.SetArgument(2, x_buffer());
}
else {
kernel.SetArgument(0, static_cast<int>(n));
- kernel.SetArgument(1, alpha);
+ kernel.SetArgument(1, GetRealArg(alpha));
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, static_cast<int>(x_offset));
kernel.SetArgument(4, static_cast<int>(x_inc));
diff --git a/src/utilities/utilities.cpp b/src/utilities/utilities.cpp
index b4a18311..24456252 100644
--- a/src/utilities/utilities.cpp
+++ b/src/utilities/utilities.cpp
@@ -151,6 +151,10 @@ std::string ToString(Precision value) {
case Precision::kComplexDouble: return ToString(static_cast<int>(value))+" (complex-double)";
}
}
+template <>
+std::string ToString(StatusCode value) {
+ return std::to_string(static_cast<int>(value));
+}
// =================================================================================================