Merge pull request #246 from CNugteren/CLBlast-224-hadamard-product

Hadamard product
author: Cedric Nugteren <web@cedricnugteren.nl> 2018-02-03 13:18:03 +0100
committer: GitHub <noreply@github.com> 2018-02-03 13:18:03 +0100
commit: 101152568a9be7d1e1ab80d609b3ee8b692f32ca (patch)
tree: 575c4eef2a3210117a574f25f81662c503ec207d /src
parent: 37c5e8f58c8c6a1f8888938baa67691f8ecddaf4 (diff)
parent: 69ed46c8da69ee18338eca5102ead43410cc01b5 (diff)
8 files changed, 650 insertions, 0 deletions
diff --git a/src/clblast.cpp b/src/clblast.cpp
index c4c51538..331a39ef 100644
--- a/src/clblast.cpp
+++ b/src/clblast.cpp
@@ -2109,6 +2109,63 @@ template StatusCode PUBLIC_API Trsm<double2>(const Layout, const Side, const Tri
 // Extra non-BLAS routines (level-X)
 // =================================================================================================
 
+// Element-wise vector product (Hadamard): SHAD/DHAD/CHAD/ZHAD/HHAD
+template <typename T>
+StatusCode Had(const size_t n,
+               const T alpha,
+               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+               const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+               const T beta,
+               cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
+               cl_command_queue* queue, cl_event* event) {
+  try {
+    auto queue_cpp = Queue(*queue);
+    auto routine = Xhad<T>(queue_cpp, event);
+    routine.DoHad(n,
+                  alpha,
+                  Buffer<T>(x_buffer), x_offset, x_inc,
+                  Buffer<T>(y_buffer), y_offset, y_inc,
+                  beta,
+                  Buffer<T>(z_buffer), z_offset, z_inc);
+    return StatusCode::kSuccess;
+  } catch (...) { return DispatchException(); }
+}
+template StatusCode PUBLIC_API Had<float>(const size_t,
+                                          const float,
+                                          const cl_mem, const size_t, const size_t,
+                                          const cl_mem, const size_t, const size_t,
+                                          const float,
+                                          cl_mem, const size_t, const size_t,
+                                          cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Had<double>(const size_t,
+                                           const double,
+                                           const cl_mem, const size_t, const size_t,
+                                           const cl_mem, const size_t, const size_t,
+                                           const double,
+                                           cl_mem, const size_t, const size_t,
+                                           cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Had<float2>(const size_t,
+                                           const float2,
+                                           const cl_mem, const size_t, const size_t,
+                                           const cl_mem, const size_t, const size_t,
+                                           const float2,
+                                           cl_mem, const size_t, const size_t,
+                                           cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Had<double2>(const size_t,
+                                            const double2,
+                                            const cl_mem, const size_t, const size_t,
+                                            const cl_mem, const size_t, const size_t,
+                                            const double2,
+                                            cl_mem, const size_t, const size_t,
+                                            cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Had<half>(const size_t,
+                                         const half,
+                                         const cl_mem, const size_t, const size_t,
+                                         const cl_mem, const size_t, const size_t,
+                                         const half,
+                                         cl_mem, const size_t, const size_t,
+                                         cl_command_queue*, cl_event*);
+
 // Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY
 template <typename T>
 StatusCode Omatcopy(const Layout layout, const Transpose a_transpose,
diff --git a/src/clblast_c.cpp b/src/clblast_c.cpp
index aa52cbca..f9592f14 100644
--- a/src/clblast_c.cpp
+++ b/src/clblast_c.cpp
@@ -3423,6 +3423,103 @@ CLBlastStatusCode CLBlastZtrsm(const CLBlastLayout layout, const CLBlastSide sid
 // Extra non-BLAS routines (level-X)
 // =================================================================================================
 
+// HAD
+CLBlastStatusCode CLBlastShad(const size_t n,
+                              const float alpha,
+                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                              const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                              const float beta,
+                              cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
+                              cl_command_queue* queue, cl_event* event) {
+  try {
+    return static_cast<CLBlastStatusCode>(
+      clblast::Had(n,
+                   alpha,
+                   x_buffer, x_offset, x_inc,
+                   y_buffer, y_offset, y_inc,
+                   beta,
+                   z_buffer, z_offset, z_inc,
+                   queue, event)
+    );
+  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastDhad(const size_t n,
+                              const double alpha,
+                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                              const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                              const double beta,
+                              cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
+                              cl_command_queue* queue, cl_event* event) {
+  try {
+    return static_cast<CLBlastStatusCode>(
+      clblast::Had(n,
+                   alpha,
+                   x_buffer, x_offset, x_inc,
+                   y_buffer, y_offset, y_inc,
+                   beta,
+                   z_buffer, z_offset, z_inc,
+                   queue, event)
+    );
+  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastChad(const size_t n,
+                              const cl_float2 alpha,
+                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                              const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                              const cl_float2 beta,
+                              cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
+                              cl_command_queue* queue, cl_event* event) {
+  try {
+    return static_cast<CLBlastStatusCode>(
+      clblast::Had(n,
+                   float2{alpha.s[0], alpha.s[1]},
+                   x_buffer, x_offset, x_inc,
+                   y_buffer, y_offset, y_inc,
+                   float2{beta.s[0], beta.s[1]},
+                   z_buffer, z_offset, z_inc,
+                   queue, event)
+    );
+  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastZhad(const size_t n,
+                              const cl_double2 alpha,
+                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                              const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                              const cl_double2 beta,
+                              cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
+                              cl_command_queue* queue, cl_event* event) {
+  try {
+    return static_cast<CLBlastStatusCode>(
+      clblast::Had(n,
+                   double2{alpha.s[0], alpha.s[1]},
+                   x_buffer, x_offset, x_inc,
+                   y_buffer, y_offset, y_inc,
+                   double2{beta.s[0], beta.s[1]},
+                   z_buffer, z_offset, z_inc,
+                   queue, event)
+    );
+  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastHhad(const size_t n,
+                              const cl_half alpha,
+                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                              const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                              const cl_half beta,
+                              cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
+                              cl_command_queue* queue, cl_event* event) {
+  try {
+    return static_cast<CLBlastStatusCode>(
+      clblast::Had(n,
+                   alpha,
+                   x_buffer, x_offset, x_inc,
+                   y_buffer, y_offset, y_inc,
+                   beta,
+                   z_buffer, z_offset, z_inc,
+                   queue, event)
+    );
+  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+
 // OMATCOPY
 CLBlastStatusCode CLBlastSomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
                                    const size_t m, const size_t n,
diff --git a/src/clblast_cuda.cpp b/src/clblast_cuda.cpp
index 0aa57087..0ba57056 100644
--- a/src/clblast_cuda.cpp
+++ b/src/clblast_cuda.cpp
@@ -2201,6 +2201,65 @@ template StatusCode PUBLIC_API Trsm<double2>(const Layout, const Side, const Tri
 // Extra non-BLAS routines (level-X)
 // =================================================================================================
 
+// Element-wise vector product (Hadamard): SHAD/DHAD/CHAD/ZHAD/HHAD
+template <typename T>
+StatusCode Had(const size_t n,
+               const T alpha,
+               const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
+               const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
+               const T beta,
+               CUdeviceptr z_buffer, const size_t z_offset, const size_t z_inc,
+               const CUcontext context, const CUdevice device) {
+  try {
+    const auto context_cpp = Context(context);
+    const auto device_cpp = Device(device);
+    auto queue_cpp = Queue(context_cpp, device_cpp);
+    auto routine = Xhad<T>(queue_cpp, nullptr);
+    routine.DoHad(n,
+                  alpha,
+                  Buffer<T>(x_buffer), x_offset, x_inc,
+                  Buffer<T>(y_buffer), y_offset, y_inc,
+                  beta,
+                  Buffer<T>(z_buffer), z_offset, z_inc);
+    return StatusCode::kSuccess;
+  } catch (...) { return DispatchException(); }
+}
+template StatusCode PUBLIC_API Had<float>(const size_t,
+                                          const float,
+                                          const CUdeviceptr, const size_t, const size_t,
+                                          const CUdeviceptr, const size_t, const size_t,
+                                          const float,
+                                          CUdeviceptr, const size_t, const size_t,
+                                          const CUcontext, const CUdevice);
+template StatusCode PUBLIC_API Had<double>(const size_t,
+                                           const double,
+                                           const CUdeviceptr, const size_t, const size_t,
+                                           const CUdeviceptr, const size_t, const size_t,
+                                           const double,
+                                           CUdeviceptr, const size_t, const size_t,
+                                           const CUcontext, const CUdevice);
+template StatusCode PUBLIC_API Had<float2>(const size_t,
+                                           const float2,
+                                           const CUdeviceptr, const size_t, const size_t,
+                                           const CUdeviceptr, const size_t, const size_t,
+                                           const float2,
+                                           CUdeviceptr, const size_t, const size_t,
+                                           const CUcontext, const CUdevice);
+template StatusCode PUBLIC_API Had<double2>(const size_t,
+                                            const double2,
+                                            const CUdeviceptr, const size_t, const size_t,
+                                            const CUdeviceptr, const size_t, const size_t,
+                                            const double2,
+                                            CUdeviceptr, const size_t, const size_t,
+                                            const CUcontext, const CUdevice);
+template StatusCode PUBLIC_API Had<half>(const size_t,
+                                         const half,
+                                         const CUdeviceptr, const size_t, const size_t,
+                                         const CUdeviceptr, const size_t, const size_t,
+                                         const half,
+                                         CUdeviceptr, const size_t, const size_t,
+                                         const CUcontext, const CUdevice);
+
 // Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY
 template <typename T>
 StatusCode Omatcopy(const Layout layout, const Transpose a_transpose,
diff --git a/src/clblast_netlib_c.cpp b/src/clblast_netlib_c.cpp
index 7859dddf..9ab663be 100644
--- a/src/clblast_netlib_c.cpp
+++ b/src/clblast_netlib_c.cpp
@@ -4621,6 +4621,140 @@ void cblas_ztrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBla
 // Extra non-BLAS routines (level-X)
 // =================================================================================================
 
+// HAD
+void cblas_shad(const int n,
+                const float alpha,
+                const float* x, const int x_inc,
+                const float* y, const int y_inc,
+                const float beta,
+                float* z, const int z_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  const auto alpha_cpp = alpha;
+  const auto beta_cpp = beta;
+  const auto x_size = n * x_inc;
+  const auto y_size = n * y_inc;
+  const auto z_size = n * z_inc;
+  auto x_buffer = clblast::Buffer<float>(context, x_size);
+  auto y_buffer = clblast::Buffer<float>(context, y_size);
+  auto z_buffer = clblast::Buffer<float>(context, z_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast<const float*>(y));
+  z_buffer.Write(queue, z_size, reinterpret_cast<float*>(z));
+  auto queue_cl = queue();
+  auto s = clblast::Had(n,
+                        alpha_cpp,
+                        x_buffer(), 0, x_inc,
+                        y_buffer(), 0, y_inc,
+                        beta_cpp,
+                        z_buffer(), 0, z_inc,
+                        &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+  z_buffer.Read(queue, z_size, reinterpret_cast<float*>(z));
+}
+void cblas_dhad(const int n,
+                const double alpha,
+                const double* x, const int x_inc,
+                const double* y, const int y_inc,
+                const double beta,
+                double* z, const int z_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  const auto alpha_cpp = alpha;
+  const auto beta_cpp = beta;
+  const auto x_size = n * x_inc;
+  const auto y_size = n * y_inc;
+  const auto z_size = n * z_inc;
+  auto x_buffer = clblast::Buffer<double>(context, x_size);
+  auto y_buffer = clblast::Buffer<double>(context, y_size);
+  auto z_buffer = clblast::Buffer<double>(context, z_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast<const double*>(y));
+  z_buffer.Write(queue, z_size, reinterpret_cast<double*>(z));
+  auto queue_cl = queue();
+  auto s = clblast::Had(n,
+                        alpha_cpp,
+                        x_buffer(), 0, x_inc,
+                        y_buffer(), 0, y_inc,
+                        beta_cpp,
+                        z_buffer(), 0, z_inc,
+                        &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+  z_buffer.Read(queue, z_size, reinterpret_cast<double*>(z));
+}
+void cblas_chad(const int n,
+                const void* alpha,
+                const void* x, const int x_inc,
+                const void* y, const int y_inc,
+                const void* beta,
+                void* z, const int z_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
+  const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
+  const auto x_size = n * x_inc;
+  const auto y_size = n * y_inc;
+  const auto z_size = n * z_inc;
+  auto x_buffer = clblast::Buffer<float2>(context, x_size);
+  auto y_buffer = clblast::Buffer<float2>(context, y_size);
+  auto z_buffer = clblast::Buffer<float2>(context, z_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast<const float2*>(y));
+  z_buffer.Write(queue, z_size, reinterpret_cast<float2*>(z));
+  auto queue_cl = queue();
+  auto s = clblast::Had(n,
+                        alpha_cpp,
+                        x_buffer(), 0, x_inc,
+                        y_buffer(), 0, y_inc,
+                        beta_cpp,
+                        z_buffer(), 0, z_inc,
+                        &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+  z_buffer.Read(queue, z_size, reinterpret_cast<float2*>(z));
+}
+void cblas_zhad(const int n,
+                const void* alpha,
+                const void* x, const int x_inc,
+                const void* y, const int y_inc,
+                const void* beta,
+                void* z, const int z_inc) {
+  auto device = get_device();
+  auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
+  const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
+  const auto x_size = n * x_inc;
+  const auto y_size = n * y_inc;
+  const auto z_size = n * z_inc;
+  auto x_buffer = clblast::Buffer<double2>(context, x_size);
+  auto y_buffer = clblast::Buffer<double2>(context, y_size);
+  auto z_buffer = clblast::Buffer<double2>(context, z_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast<const double2*>(y));
+  z_buffer.Write(queue, z_size, reinterpret_cast<double2*>(z));
+  auto queue_cl = queue();
+  auto s = clblast::Had(n,
+                        alpha_cpp,
+                        x_buffer(), 0, x_inc,
+                        y_buffer(), 0, y_inc,
+                        beta_cpp,
+                        z_buffer(), 0, z_inc,
+                        &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+  z_buffer.Read(queue, z_size, reinterpret_cast<double2*>(z));
+}
+
 // OMATCOPY
 void cblas_somatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
                      const int m, const int n,
diff --git a/src/kernels/level1/xhad.opencl b/src/kernels/level1/xhad.opencl
new file mode 100644
index 00000000..3880b7a4
--- /dev/null
+++ b/src/kernels/level1/xhad.opencl
@@ -0,0 +1,145 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the Xhad kernel. It contains one fast vectorized version in case of unit
+// strides (incx=incy=incz=1) and no offsets (offx=offy=offz=0). Another version is more general,
+// but doesn't support vector data-types. Based on the XAXPY kernels.
+//
+// This kernel uses the level-1 BLAS common tuning parameters.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// A vector-vector multiply function. See also level1.opencl for a vector-scalar version
+INLINE_FUNC realV MultiplyVectorVector(realV cvec, const realV aval, const realV bvec) {
+  #if VW == 1
+    Multiply(cvec, aval, bvec);
+  #elif VW == 2
+    Multiply(cvec.x, aval.x, bvec.x);
+    Multiply(cvec.y, aval.y, bvec.y);
+  #elif VW == 4
+    Multiply(cvec.x, aval.x, bvec.x);
+    Multiply(cvec.y, aval.y, bvec.y);
+    Multiply(cvec.z, aval.z, bvec.z);
+    Multiply(cvec.w, aval.w, bvec.w);
+  #elif VW == 8
+    Multiply(cvec.s0, aval.s0, bvec.s0);
+    Multiply(cvec.s1, aval.s1, bvec.s1);
+    Multiply(cvec.s2, aval.s2, bvec.s2);
+    Multiply(cvec.s3, aval.s3, bvec.s3);
+    Multiply(cvec.s4, aval.s4, bvec.s4);
+    Multiply(cvec.s5, aval.s5, bvec.s5);
+    Multiply(cvec.s6, aval.s6, bvec.s6);
+    Multiply(cvec.s7, aval.s7, bvec.s7);
+  #elif VW == 16
+    Multiply(cvec.s0, aval.s0, bvec.s0);
+    Multiply(cvec.s1, aval.s1, bvec.s1);
+    Multiply(cvec.s2, aval.s2, bvec.s2);
+    Multiply(cvec.s3, aval.s3, bvec.s3);
+    Multiply(cvec.s4, aval.s4, bvec.s4);
+    Multiply(cvec.s5, aval.s5, bvec.s5);
+    Multiply(cvec.s6, aval.s6, bvec.s6);
+    Multiply(cvec.s7, aval.s7, bvec.s7);
+    Multiply(cvec.s8, aval.s8, bvec.s8);
+    Multiply(cvec.s9, aval.s9, bvec.s9);
+    Multiply(cvec.sA, aval.sA, bvec.sA);
+    Multiply(cvec.sB, aval.sB, bvec.sB);
+    Multiply(cvec.sC, aval.sC, bvec.sC);
+    Multiply(cvec.sD, aval.sD, bvec.sD);
+    Multiply(cvec.sE, aval.sE, bvec.sE);
+    Multiply(cvec.sF, aval.sF, bvec.sF);
+  #endif
+  return cvec;
+}
+
+// =================================================================================================
+
+// Full version of the kernel with offsets and strided accesses
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void Xhad(const int n, const real_arg arg_alpha, const real_arg arg_beta,
+          const __global real* restrict xgm, const int x_offset, const int x_inc,
+          const __global real* restrict ygm, const int y_offset, const int y_inc,
+          __global real* zgm, const int z_offset, const int z_inc) {
+  const real alpha = GetRealArg(arg_alpha);
+  const real beta = GetRealArg(arg_beta);
+
+  // Loops over the work that needs to be done (allows for an arbitrary number of threads)
+  for (int id = get_global_id(0); id < n; id += get_global_size(0)) {
+    real xvalue = xgm[id*x_inc + x_offset];
+    real yvalue = ygm[id*y_inc + y_offset];
+    real zvalue = zgm[id*z_inc + z_offset];
+    real result;
+    real alpha_times_x;
+    Multiply(alpha_times_x, alpha, xvalue);
+    Multiply(result, alpha_times_x, yvalue);
+    MultiplyAdd(result, beta, zvalue);
+    zgm[id*z_inc + z_offset] = result;
+  }
+}
+
+// Faster version of the kernel without offsets and strided accesses but with if-statement. Also
+// assumes that 'n' is dividable by 'VW' and 'WPT'.
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void XhadFaster(const int n, const real_arg arg_alpha, const real_arg arg_beta,
+                const __global realV* restrict xgm, const __global realV* restrict ygm,
+                __global realV* zgm) {
+  const real alpha = GetRealArg(arg_alpha);
+  const real beta = GetRealArg(arg_beta);
+
+  if (get_global_id(0) < n / (VW)) {
+    #pragma unroll
+    for (int _w = 0; _w < WPT; _w += 1) {
+      const int id = _w*get_global_size(0) + get_global_id(0);
+      realV xvalue = xgm[id];
+      realV yvalue = ygm[id];
+      realV zvalue = zgm[id];
+      realV result;
+      realV alpha_times_x;
+      alpha_times_x = MultiplyVector(alpha_times_x, alpha, xvalue);
+      result = MultiplyVectorVector(result, alpha_times_x, yvalue);
+      zgm[id] = MultiplyAddVector(result, beta, zvalue);
+    }
+  }
+}
+
+// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
+// dividable by 'VW', 'WGS' and 'WPT'.
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void XhadFastest(const int n, const real_arg arg_alpha, const real_arg arg_beta,
+                 const __global realV* restrict xgm, const __global realV* restrict ygm,
+                 __global realV* zgm) {
+  const real alpha = GetRealArg(arg_alpha);
+  const real beta = GetRealArg(arg_beta);
+
+  #pragma unroll
+  for (int _w = 0; _w < WPT; _w += 1) {
+    const int id = _w*get_global_size(0) + get_global_id(0);
+    realV xvalue = xgm[id];
+    realV yvalue = ygm[id];
+    realV zvalue = zgm[id];
+    realV result;
+    realV alpha_times_x;
+    alpha_times_x = MultiplyVector(alpha_times_x, alpha, xvalue);
+    result = MultiplyVectorVector(result, alpha_times_x, yvalue);
+    zgm[id] = MultiplyAddVector(result, beta, zvalue);
+  }
+}
+
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/routines/levelx/xhad.cpp b/src/routines/levelx/xhad.cpp
new file mode 100644
index 00000000..da416cc7
--- /dev/null
+++ b/src/routines/levelx/xhad.cpp
@@ -0,0 +1,116 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xhad class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "routines/levelx/xhad.hpp"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xhad<T>::Xhad(Queue &queue, EventPointer event, const std::string &name):
+    Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, {
+#include "../../kernels/level1/level1.opencl"
+#include "../../kernels/level1/xhad.opencl"
+    }) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+void Xhad<T>::DoHad(const size_t n, const T alpha,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, const T beta,
+                    const Buffer<T> &z_buffer, const size_t z_offset, const size_t z_inc) {
+
+  // Makes sure all dimensions are larger than zero
+  if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
+
+  // Tests the vectors for validity
+  TestVectorX(n, x_buffer, x_offset, x_inc);
+  TestVectorY(n, y_buffer, y_offset, y_inc);
+  TestVectorY(n, z_buffer, z_offset, z_inc); // TODO: Make a TestVectorZ function with error codes
+
+  // Determines whether or not the fast-version can be used
+  const auto use_faster_kernel = (x_offset == 0) && (x_inc == 1) &&
+                                 (y_offset == 0) && (y_inc == 1) &&
+                                 (z_offset == 0) && (z_inc == 1) &&
+                                 IsMultiple(n, db_["WPT"]*db_["VW"]);
+  const auto use_fastest_kernel = use_faster_kernel &&
+                                  IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]);
+
+  // If possible, run the fast-version of the kernel
+  const auto kernel_name = (use_fastest_kernel) ? "XhadFastest" :
+                           (use_faster_kernel) ? "XhadFaster" : "Xhad";
+
+  // Retrieves the Xhad kernel from the compiled binary
+  auto kernel = Kernel(program_, kernel_name);
+
+  // Sets the kernel arguments
+  if (use_faster_kernel || use_fastest_kernel) {
+    kernel.SetArgument(0, static_cast<int>(n));
+    kernel.SetArgument(1, GetRealArg(alpha));
+    kernel.SetArgument(2, GetRealArg(beta));
+    kernel.SetArgument(3, x_buffer());
+    kernel.SetArgument(4, y_buffer());
+    kernel.SetArgument(5, z_buffer());
+  }
+  else {
+    kernel.SetArgument(0, static_cast<int>(n));
+    kernel.SetArgument(1, GetRealArg(alpha));
+    kernel.SetArgument(2, GetRealArg(beta));
+    kernel.SetArgument(3, x_buffer());
+    kernel.SetArgument(4, static_cast<int>(x_offset));
+    kernel.SetArgument(5, static_cast<int>(x_inc));
+    kernel.SetArgument(6, y_buffer());
+    kernel.SetArgument(7, static_cast<int>(y_offset));
+    kernel.SetArgument(8, static_cast<int>(y_inc));
+    kernel.SetArgument(9, z_buffer());
+    kernel.SetArgument(10, static_cast<int>(z_offset));
+    kernel.SetArgument(11, static_cast<int>(z_inc));
+  }
+
+  // Launches the kernel
+  if (use_fastest_kernel) {
+    auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
+    auto local = std::vector<size_t>{db_["WGS"]};
+    RunKernel(kernel, queue_, device_, global, local, event_);
+  }
+  else if (use_faster_kernel) {
+    auto global = std::vector<size_t>{Ceil(CeilDiv(n, db_["WPT"]*db_["VW"]), db_["WGS"])};
+    auto local = std::vector<size_t>{db_["WGS"]};
+    RunKernel(kernel, queue_, device_, global, local, event_);
+  }
+  else {
+    const auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
+    auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
+    auto local = std::vector<size_t>{db_["WGS"]};
+    RunKernel(kernel, queue_, device_, global, local, event_);
+  }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xhad<half>;
+template class Xhad<float>;
+template class Xhad<double>;
+template class Xhad<float2>;
+template class Xhad<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/levelx/xhad.hpp b/src/routines/levelx/xhad.hpp
new file mode 100644
index 00000000..eb3e1c3e
--- /dev/null
+++ b/src/routines/levelx/xhad.hpp
@@ -0,0 +1,41 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xhad routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XHAD_H_
+#define CLBLAST_ROUTINES_XHAD_H_
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xhad: public Routine {
+public:
+
+  // Constructor
+  Xhad(Queue &queue, EventPointer event, const std::string &name = "HAD");
+
+  // Templated-precision implementation of the routine
+  void DoHad(const size_t n, const T alpha,
+             const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+             const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, const T beta,
+             const Buffer<T> &z_buffer, const size_t z_offset, const size_t z_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XHAD_H_
+#endif
diff --git a/src/routines/routines.hpp b/src/routines/routines.hpp
index 0aeff707..2ab16a75 100644
--- a/src/routines/routines.hpp
+++ b/src/routines/routines.hpp
@@ -67,6 +67,7 @@
 #include "routines/level3/xtrsm.hpp"
 
 // Level-x includes (non-BLAS)
+#include "routines/levelx/xhad.hpp"
 #include "routines/levelx/xomatcopy.hpp"
 #include "routines/levelx/xim2col.hpp"
 #include "routines/levelx/xaxpybatched.hpp"
author	Cedric Nugteren <web@cedricnugteren.nl>	2018-02-03 13:18:03 +0100
committer	GitHub <noreply@github.com>	2018-02-03 13:18:03 +0100
commit	101152568a9be7d1e1ab80d609b3ee8b692f32ca (patch)
tree	575c4eef2a3210117a574f25f81662c503ec207d /src
parent	37c5e8f58c8c6a1f8888938baa67691f8ecddaf4 (diff)
parent	69ed46c8da69ee18338eca5102ead43410cc01b5 (diff)