summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/clblast.cpp57
-rw-r--r--src/clblast_c.cpp97
-rw-r--r--src/clblast_cuda.cpp59
-rw-r--r--src/clblast_netlib_c.cpp134
-rw-r--r--src/routines/levelx/xhad.cpp60
-rw-r--r--src/routines/levelx/xhad.hpp41
-rw-r--r--src/routines/routines.hpp1
7 files changed, 449 insertions, 0 deletions
diff --git a/src/clblast.cpp b/src/clblast.cpp
index c4c51538..331a39ef 100644
--- a/src/clblast.cpp
+++ b/src/clblast.cpp
@@ -2109,6 +2109,63 @@ template StatusCode PUBLIC_API Trsm<double2>(const Layout, const Side, const Tri
// Extra non-BLAS routines (level-X)
// =================================================================================================
+// Element-wise vector product (Hadamard): SHAD/DHAD/CHAD/ZHAD/HHAD. Forwards to Xhad<T>::DoHad.
+template <typename T>
+StatusCode Had(const size_t n,
+ const T alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ const T beta,
+ cl_mem z_buffer, const size_t z_offset, const size_t z_inc, // only non-const buffer; presumably the output - confirm
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ auto queue_cpp = Queue(*queue); // wraps the caller's raw OpenCL command queue
+ auto routine = Xhad<T>(queue_cpp, event);
+ routine.DoHad(n,
+ alpha,
+ Buffer<T>(x_buffer), x_offset, x_inc, // raw cl_mem handles are wrapped as typed Buffer<T> objects
+ Buffer<T>(y_buffer), y_offset, y_inc,
+ beta,
+ Buffer<T>(z_buffer), z_offset, z_inc);
+ return StatusCode::kSuccess; // reached only when nothing above threw
+ } catch (...) { return DispatchException(); } // any exception is turned into the returned StatusCode
+}
+template StatusCode PUBLIC_API Had<float>(const size_t, // explicit instantiation for float (SHAD)
+ const float,
+ const cl_mem, const size_t, const size_t,
+ const cl_mem, const size_t, const size_t,
+ const float,
+ cl_mem, const size_t, const size_t,
+ cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Had<double>(const size_t, // explicit instantiation for double (DHAD)
+ const double,
+ const cl_mem, const size_t, const size_t,
+ const cl_mem, const size_t, const size_t,
+ const double,
+ cl_mem, const size_t, const size_t,
+ cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Had<float2>(const size_t, // explicit instantiation for float2 (CHAD)
+ const float2,
+ const cl_mem, const size_t, const size_t,
+ const cl_mem, const size_t, const size_t,
+ const float2,
+ cl_mem, const size_t, const size_t,
+ cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Had<double2>(const size_t, // explicit instantiation for double2 (ZHAD)
+ const double2,
+ const cl_mem, const size_t, const size_t,
+ const cl_mem, const size_t, const size_t,
+ const double2,
+ cl_mem, const size_t, const size_t,
+ cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Had<half>(const size_t, // explicit instantiation for half (HHAD)
+ const half,
+ const cl_mem, const size_t, const size_t,
+ const cl_mem, const size_t, const size_t,
+ const half,
+ cl_mem, const size_t, const size_t,
+ cl_command_queue*, cl_event*);
+
// Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY
template <typename T>
StatusCode Omatcopy(const Layout layout, const Transpose a_transpose,
diff --git a/src/clblast_c.cpp b/src/clblast_c.cpp
index aa52cbca..f9592f14 100644
--- a/src/clblast_c.cpp
+++ b/src/clblast_c.cpp
@@ -3423,6 +3423,103 @@ CLBlastStatusCode CLBlastZtrsm(const CLBlastLayout layout, const CLBlastSide sid
// Extra non-BLAS routines (level-X)
// =================================================================================================
+// HAD: C wrappers that forward to the templated clblast::Had and return its status as a C enum
+CLBlastStatusCode CLBlastShad(const size_t n,
+ const float alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ const float beta,
+ cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>( // C++ StatusCode is cast to the C status enum
+ clblast::Had(n,
+ alpha, // float scalars are passed through unchanged
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ beta,
+ z_buffer, z_offset, z_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); } // exceptions are mapped to a C status code
+}
+CLBlastStatusCode CLBlastDhad(const size_t n, // double-precision C wrapper around clblast::Had
+ const double alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ const double beta,
+ cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>( // C++ StatusCode is cast to the C status enum
+ clblast::Had(n,
+ alpha, // double scalars are passed through unchanged
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ beta,
+ z_buffer, z_offset, z_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); } // exceptions are mapped to a C status code
+}
+CLBlastStatusCode CLBlastChad(const size_t n, // C wrapper around clblast::Had for cl_float2 scalars
+ const cl_float2 alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ const cl_float2 beta,
+ cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>( // C++ StatusCode is cast to the C status enum
+ clblast::Had(n,
+ float2{alpha.s[0], alpha.s[1]}, // the two cl_float2 components are repacked into a C++ float2
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ float2{beta.s[0], beta.s[1]}, // same repacking for beta
+ z_buffer, z_offset, z_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); } // exceptions are mapped to a C status code
+}
+CLBlastStatusCode CLBlastZhad(const size_t n, // C wrapper around clblast::Had for cl_double2 scalars
+ const cl_double2 alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ const cl_double2 beta,
+ cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>( // C++ StatusCode is cast to the C status enum
+ clblast::Had(n,
+ double2{alpha.s[0], alpha.s[1]}, // the two cl_double2 components are repacked into a C++ double2
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ double2{beta.s[0], beta.s[1]}, // same repacking for beta
+ z_buffer, z_offset, z_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); } // exceptions are mapped to a C status code
+}
+CLBlastStatusCode CLBlastHhad(const size_t n, // C wrapper around clblast::Had for cl_half scalars
+ const cl_half alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ const cl_half beta,
+ cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
+ cl_command_queue* queue, cl_event* event) {
+ try {
+ return static_cast<CLBlastStatusCode>( // C++ StatusCode is cast to the C status enum
+ clblast::Had(n,
+ alpha, // cl_half is handed over as-is (no repacking, unlike the cl_float2/cl_double2 wrappers)
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ beta,
+ z_buffer, z_offset, z_inc,
+ queue, event)
+ );
+ } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); } // exceptions are mapped to a C status code
+}
+
// OMATCOPY
CLBlastStatusCode CLBlastSomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const size_t m, const size_t n,
diff --git a/src/clblast_cuda.cpp b/src/clblast_cuda.cpp
index 0aa57087..0ba57056 100644
--- a/src/clblast_cuda.cpp
+++ b/src/clblast_cuda.cpp
@@ -2201,6 +2201,65 @@ template StatusCode PUBLIC_API Trsm<double2>(const Layout, const Side, const Tri
// Extra non-BLAS routines (level-X)
// =================================================================================================
+// Element-wise vector product (Hadamard): SHAD/DHAD/CHAD/ZHAD/HHAD. CUDA variant; forwards to Xhad<T>::DoHad.
+template <typename T>
+StatusCode Had(const size_t n,
+ const T alpha,
+ const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
+ const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
+ const T beta,
+ CUdeviceptr z_buffer, const size_t z_offset, const size_t z_inc, // only non-const buffer; presumably the output - confirm
+ const CUcontext context, const CUdevice device) {
+ try {
+ const auto context_cpp = Context(context);
+ const auto device_cpp = Device(device);
+ auto queue_cpp = Queue(context_cpp, device_cpp); // the queue is built here from the given context and device
+ auto routine = Xhad<T>(queue_cpp, nullptr); // no event is passed in this variant
+ routine.DoHad(n,
+ alpha,
+ Buffer<T>(x_buffer), x_offset, x_inc, // raw CUdeviceptr values are wrapped as typed Buffer<T> objects
+ Buffer<T>(y_buffer), y_offset, y_inc,
+ beta,
+ Buffer<T>(z_buffer), z_offset, z_inc);
+ return StatusCode::kSuccess; // reached only when nothing above threw
+ } catch (...) { return DispatchException(); } // any exception is turned into the returned StatusCode
+}
+template StatusCode PUBLIC_API Had<float>(const size_t, // explicit instantiation for float (SHAD)
+ const float,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const float,
+ CUdeviceptr, const size_t, const size_t,
+ const CUcontext, const CUdevice);
+template StatusCode PUBLIC_API Had<double>(const size_t, // explicit instantiation for double (DHAD)
+ const double,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const double,
+ CUdeviceptr, const size_t, const size_t,
+ const CUcontext, const CUdevice);
+template StatusCode PUBLIC_API Had<float2>(const size_t, // explicit instantiation for float2 (CHAD)
+ const float2,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const float2,
+ CUdeviceptr, const size_t, const size_t,
+ const CUcontext, const CUdevice);
+template StatusCode PUBLIC_API Had<double2>(const size_t, // explicit instantiation for double2 (ZHAD)
+ const double2,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const double2,
+ CUdeviceptr, const size_t, const size_t,
+ const CUcontext, const CUdevice);
+template StatusCode PUBLIC_API Had<half>(const size_t, // explicit instantiation for half (HHAD)
+ const half,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const half,
+ CUdeviceptr, const size_t, const size_t,
+ const CUcontext, const CUdevice);
+
// Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY
template <typename T>
StatusCode Omatcopy(const Layout layout, const Transpose a_transpose,
diff --git a/src/clblast_netlib_c.cpp b/src/clblast_netlib_c.cpp
index 7859dddf..9ab663be 100644
--- a/src/clblast_netlib_c.cpp
+++ b/src/clblast_netlib_c.cpp
@@ -4621,6 +4621,140 @@ void cblas_ztrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBla
// Extra non-BLAS routines (level-X)
// =================================================================================================
+// HAD: host-pointer wrappers that copy the arrays into device buffers, call clblast::Had, and read z back
+void cblas_shad(const int n,
+ const float alpha,
+ const float* x, const int x_inc,
+ const float* y, const int y_inc,
+ const float beta,
+ float* z, const int z_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device); // a fresh context and queue are created on every call
+ const auto alpha_cpp = alpha;
+ const auto beta_cpp = beta;
+ const auto x_size = n * x_inc; // element count for the x buffer; NOTE(review): computed in int, may overflow for large inputs
+ const auto y_size = n * y_inc;
+ const auto z_size = n * z_inc;
+ auto x_buffer = clblast::Buffer<float>(context, x_size);
+ auto y_buffer = clblast::Buffer<float>(context, y_size);
+ auto z_buffer = clblast::Buffer<float>(context, z_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const float*>(y));
+ z_buffer.Write(queue, z_size, reinterpret_cast<float*>(z)); // z is uploaded as well; presumably its old values feed the beta term - confirm
+ auto queue_cl = queue(); // raw queue handle, passed by address below
+ auto s = clblast::Had(n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc, // offsets are always 0 in this wrapper
+ y_buffer(), 0, y_inc,
+ beta_cpp,
+ z_buffer(), 0, z_inc,
+ &queue_cl); // no event argument is passed here
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); // failures are reported by throwing
+ }
+ z_buffer.Read(queue, z_size, reinterpret_cast<float*>(z)); // copies the device z buffer back into the caller's array
+}
+void cblas_dhad(const int n, // double-precision host-pointer wrapper around clblast::Had
+ const double alpha,
+ const double* x, const int x_inc,
+ const double* y, const int y_inc,
+ const double beta,
+ double* z, const int z_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device); // a fresh context and queue are created on every call
+ const auto alpha_cpp = alpha;
+ const auto beta_cpp = beta;
+ const auto x_size = n * x_inc; // element count for the x buffer; NOTE(review): computed in int, may overflow for large inputs
+ const auto y_size = n * y_inc;
+ const auto z_size = n * z_inc;
+ auto x_buffer = clblast::Buffer<double>(context, x_size);
+ auto y_buffer = clblast::Buffer<double>(context, y_size);
+ auto z_buffer = clblast::Buffer<double>(context, z_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+ y_buffer.Write(queue, y_size, reinterpret_cast<const double*>(y));
+ z_buffer.Write(queue, z_size, reinterpret_cast<double*>(z)); // z is uploaded as well; presumably its old values feed the beta term - confirm
+ auto queue_cl = queue(); // raw queue handle, passed by address below
+ auto s = clblast::Had(n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc, // offsets are always 0 in this wrapper
+ y_buffer(), 0, y_inc,
+ beta_cpp,
+ z_buffer(), 0, z_inc,
+ &queue_cl); // no event argument is passed here
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); // failures are reported by throwing
+ }
+ z_buffer.Read(queue, z_size, reinterpret_cast<double*>(z)); // copies the device z buffer back into the caller's array
+}
+void cblas_chad(const int n, // host-pointer wrapper around clblast::Had for float2 data passed as void*
+ const void* alpha,
+ const void* x, const int x_inc,
+ const void* y, const int y_inc,
+ const void* beta,
+ void* z, const int z_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device); // a fresh context and queue are created on every call
+ const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]}; // alpha is read as two consecutive floats
+ const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]}; // beta is read the same way
+ const auto x_size = n * x_inc; // element count (in float2 units) for the x buffer; NOTE(review): computed in int
+ const auto y_size = n * y_inc;
+ const auto z_size = n * z_inc;
+ auto x_buffer = clblast::Buffer<float2>(context, x_size);
+ auto y_buffer = clblast::Buffer<float2>(context, y_size);
+ auto z_buffer = clblast::Buffer<float2>(context, z_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x)); // the void* arrays are reinterpreted as float2 arrays
+ y_buffer.Write(queue, y_size, reinterpret_cast<const float2*>(y));
+ z_buffer.Write(queue, z_size, reinterpret_cast<float2*>(z)); // z is uploaded as well; presumably its old values feed the beta term - confirm
+ auto queue_cl = queue(); // raw queue handle, passed by address below
+ auto s = clblast::Had(n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc, // offsets are always 0 in this wrapper
+ y_buffer(), 0, y_inc,
+ beta_cpp,
+ z_buffer(), 0, z_inc,
+ &queue_cl); // no event argument is passed here
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); // failures are reported by throwing
+ }
+ z_buffer.Read(queue, z_size, reinterpret_cast<float2*>(z)); // copies the device z buffer back into the caller's array
+}
+void cblas_zhad(const int n, // host-pointer wrapper around clblast::Had for double2 data passed as void*
+ const void* alpha,
+ const void* x, const int x_inc,
+ const void* y, const int y_inc,
+ const void* beta,
+ void* z, const int z_inc) {
+ auto device = get_device();
+ auto context = clblast::Context(device);
+ auto queue = clblast::Queue(context, device); // a fresh context and queue are created on every call
+ const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]}; // alpha is read as two consecutive doubles
+ const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]}; // beta is read the same way
+ const auto x_size = n * x_inc; // element count (in double2 units) for the x buffer; NOTE(review): computed in int
+ const auto y_size = n * y_inc;
+ const auto z_size = n * z_inc;
+ auto x_buffer = clblast::Buffer<double2>(context, x_size);
+ auto y_buffer = clblast::Buffer<double2>(context, y_size);
+ auto z_buffer = clblast::Buffer<double2>(context, z_size);
+ x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x)); // the void* arrays are reinterpreted as double2 arrays
+ y_buffer.Write(queue, y_size, reinterpret_cast<const double2*>(y));
+ z_buffer.Write(queue, z_size, reinterpret_cast<double2*>(z)); // z is uploaded as well; presumably its old values feed the beta term - confirm
+ auto queue_cl = queue(); // raw queue handle, passed by address below
+ auto s = clblast::Had(n,
+ alpha_cpp,
+ x_buffer(), 0, x_inc, // offsets are always 0 in this wrapper
+ y_buffer(), 0, y_inc,
+ beta_cpp,
+ z_buffer(), 0, z_inc,
+ &queue_cl); // no event argument is passed here
+ if (s != clblast::StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); // failures are reported by throwing
+ }
+ z_buffer.Read(queue, z_size, reinterpret_cast<double2*>(z)); // copies the device z buffer back into the caller's array
+}
+
// OMATCOPY
void cblas_somatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const int m, const int n,
diff --git a/src/routines/levelx/xhad.cpp b/src/routines/levelx/xhad.cpp
new file mode 100644
index 00000000..46ae8031
--- /dev/null
+++ b/src/routines/levelx/xhad.cpp
@@ -0,0 +1,60 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xhad class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "routines/levelx/xhad.hpp"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor, handing over the two kernel files included below
+template <typename T>
+Xhad<T>::Xhad(Queue &queue, EventPointer event, const std::string &name):
+ Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, { // NOTE(review): passes "Xaxpy" and the xaxpy kernel file; nothing HAD-specific is referenced - confirm intent
+#include "../../kernels/level1/level1.opencl"
+#include "../../kernels/level1/xaxpy.opencl"
+ }) {
+}
+
+// =================================================================================================
+
+// The main routine. NOTE(review): this body only validates its arguments; alpha/beta are unused and no kernel is launched
+template <typename T>
+void Xhad<T>::DoHad(const size_t n, const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, const T beta,
+ const Buffer<T> &z_buffer, const size_t z_offset, const size_t z_inc) {
+
+ // Makes sure all dimensions are larger than zero
+ if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); } // n is the only dimension of this routine
+
+ // Tests the vectors for validity
+ TestVectorX(n, x_buffer, x_offset, x_inc);
+ TestVectorY(n, y_buffer, y_offset, y_inc);
+ TestVectorY(n, z_buffer, z_offset, z_inc); // TODO: Make a TestVectorZ function with error codes
+
+}
+
+// =================================================================================================
+
+// Compiles the templated class: one explicit instantiation per element type listed below
+template class Xhad<half>;
+template class Xhad<float>;
+template class Xhad<double>;
+template class Xhad<float2>;
+template class Xhad<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/levelx/xhad.hpp b/src/routines/levelx/xhad.hpp
new file mode 100644
index 00000000..eb3e1c3e
--- /dev/null
+++ b/src/routines/levelx/xhad.hpp
@@ -0,0 +1,41 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xhad routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XHAD_H_
+#define CLBLAST_ROUTINES_XHAD_H_
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class (templated on the element type T)
+template <typename T>
+class Xhad: public Routine {
+public:
+
+ // Constructor; the routine name defaults to "HAD" when the caller does not supply one
+ Xhad(Queue &queue, EventPointer event, const std::string &name = "HAD");
+
+ // Templated-precision implementation of the routine; takes n, scalars alpha/beta and three buffers
+ void DoHad(const size_t n, const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, const T beta,
+ const Buffer<T> &z_buffer, const size_t z_offset, const size_t z_inc); // z is const-ref like x and y; presumably the output - confirm
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XHAD_H_
+#endif
diff --git a/src/routines/routines.hpp b/src/routines/routines.hpp
index 0aeff707..2ab16a75 100644
--- a/src/routines/routines.hpp
+++ b/src/routines/routines.hpp
@@ -67,6 +67,7 @@
#include "routines/level3/xtrsm.hpp"
// Level-x includes (non-BLAS)
+#include "routines/levelx/xhad.hpp"
#include "routines/levelx/xomatcopy.hpp"
#include "routines/levelx/xim2col.hpp"
#include "routines/levelx/xaxpybatched.hpp"