diff options
-rw-r--r-- | CHANGELOG                      |   4
-rw-r--r-- | README.md                      |   1
-rw-r--r-- | ROADMAP.md                     |   2
-rw-r--r-- | src/kernels/level1/xhad.opencl | 145
-rw-r--r-- | src/routines/levelx/xhad.cpp   |  58
-rw-r--r-- | test/routines/levelx/xhad.hpp  |  48
6 files changed, 241 insertions, 17 deletions
@@ -1,4 +1,8 @@ +Development (next version) +- Added non-BLAS level-1 routines: + * SHAD/DHAD/CHAD/ZHAD/HHAD (Hadamard element-wise vector-vector product) + Version 1.3.0 - Re-designed and integrated the auto-tuner, no more dependency on CLTune - Made it possible to override the tuning parameters in the clients straight from JSON tuning files @@ -319,6 +319,7 @@ In addition, some extra non-BLAS routines are also supported by CLBlast, classif | IxAMIN | ✔ | ✔ | ✔ | ✔ | ✔ | | IxMAX | ✔ | ✔ | ✔ | ✔ | ✔ | | IxMIN | ✔ | ✔ | ✔ | ✔ | ✔ | +| xHAD | ✔ | ✔ | ✔ | ✔ | ✔ | (Hadamard product) | xOMATCOPY | ✔ | ✔ | ✔ | ✔ | ✔ | | xIM2COL | ✔ | ✔ | ✔ | ✔ | ✔ | @@ -13,7 +13,7 @@ This file gives an overview of the main features planned for addition to CLBlast | [#207](https://github.com/CNugteren/CLBlast/issues/207) | Dec '17 | CNugteren | ✔ | Tuning of the TRSM/TRSV routines | | [#195](https://github.com/CNugteren/CLBlast/issues/195) | Jan '18 | CNugteren | ✔ | Extra GEMM API with pre-allocated temporary buffer | | [#95](https://github.com/CNugteren/CLBlast/issues/95) & #237 | Jan '18 | CNugteren | ✔ | Implement strided batch GEMM | -| [#224](https://github.com/CNugteren/CLBlast/issues/224) | Jan-Feb '18 | CNugteren | | Implement Hadamard product (element-wise vector-vector product) | +| [#224](https://github.com/CNugteren/CLBlast/issues/224) | Jan-Feb '18 | CNugteren | ✔ | Implement Hadamard product (element-wise vector-vector product) | | [#233](https://github.com/CNugteren/CLBlast/issues/233) | Feb '18 | CNugteren | | Add CLBlast to common package managers | | [#223](https://github.com/CNugteren/CLBlast/issues/223) | Feb '18 | CNugteren | | Python OpenCL interface | | [#169](https://github.com/CNugteren/CLBlast/issues/169) | ?? 
| dividiti | | Problem-specific tuning parameter selection | diff --git a/src/kernels/level1/xhad.opencl b/src/kernels/level1/xhad.opencl new file mode 100644 index 00000000..3880b7a4 --- /dev/null +++ b/src/kernels/level1/xhad.opencl @@ -0,0 +1,145 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file contains the Xhad kernel. It contains one fast vectorized version in case of unit +// strides (incx=incy=incz=1) and no offsets (offx=offy=offz=0). Another version is more general, +// but doesn't support vector data-types. Based on the XAXPY kernels. +// +// This kernel uses the level-1 BLAS common tuning parameters. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// A vector-vector multiply function. 
// A vector-vector multiply helper: applies the precision-generic Multiply() macro to every
// component of the tuning-selected vector data-type 'realV' (VW = vector width, a tuning
// parameter). See also level1.opencl for the vector-scalar counterpart MultiplyVector().
INLINE_FUNC realV MultiplyVectorVector(realV cvec, const realV aval, const realV bvec) {
  #if VW == 1
    Multiply(cvec, aval, bvec);
  #elif VW == 2
    Multiply(cvec.x, aval.x, bvec.x);
    Multiply(cvec.y, aval.y, bvec.y);
  #elif VW == 4
    Multiply(cvec.x, aval.x, bvec.x);
    Multiply(cvec.y, aval.y, bvec.y);
    Multiply(cvec.z, aval.z, bvec.z);
    Multiply(cvec.w, aval.w, bvec.w);
  #elif VW == 8
    Multiply(cvec.s0, aval.s0, bvec.s0);
    Multiply(cvec.s1, aval.s1, bvec.s1);
    Multiply(cvec.s2, aval.s2, bvec.s2);
    Multiply(cvec.s3, aval.s3, bvec.s3);
    Multiply(cvec.s4, aval.s4, bvec.s4);
    Multiply(cvec.s5, aval.s5, bvec.s5);
    Multiply(cvec.s6, aval.s6, bvec.s6);
    Multiply(cvec.s7, aval.s7, bvec.s7);
  #elif VW == 16
    Multiply(cvec.s0, aval.s0, bvec.s0);
    Multiply(cvec.s1, aval.s1, bvec.s1);
    Multiply(cvec.s2, aval.s2, bvec.s2);
    Multiply(cvec.s3, aval.s3, bvec.s3);
    Multiply(cvec.s4, aval.s4, bvec.s4);
    Multiply(cvec.s5, aval.s5, bvec.s5);
    Multiply(cvec.s6, aval.s6, bvec.s6);
    Multiply(cvec.s7, aval.s7, bvec.s7);
    Multiply(cvec.s8, aval.s8, bvec.s8);
    Multiply(cvec.s9, aval.s9, bvec.s9);
    Multiply(cvec.sA, aval.sA, bvec.sA);
    Multiply(cvec.sB, aval.sB, bvec.sB);
    Multiply(cvec.sC, aval.sC, bvec.sC);
    Multiply(cvec.sD, aval.sD, bvec.sD);
    Multiply(cvec.sE, aval.sE, bvec.sE);
    Multiply(cvec.sF, aval.sF, bvec.sF);
  #endif
  return cvec;
}

// =================================================================================================

// Full version of the kernel: supports arbitrary offsets and increments on all three vectors.
// Computes the scaled Hadamard product z[i] = alpha * x[i] * y[i] + beta * z[i].
__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
void Xhad(const int n, const real_arg arg_alpha, const real_arg arg_beta,
          const __global real* restrict xgm, const int x_offset, const int x_inc,
          const __global real* restrict ygm, const int y_offset, const int y_inc,
          __global real* zgm, const int z_offset, const int z_inc) {
  const real alpha = GetRealArg(arg_alpha);
  const real beta = GetRealArg(arg_beta);

  // Loops with a stride of the total NDRange size, so an arbitrary number of threads covers
  // all 'n' elements
  for (int id = get_global_id(0); id < n; id += get_global_size(0)) {
    real xvalue = xgm[id*x_inc + x_offset];
    real yvalue = ygm[id*y_inc + y_offset];
    real zvalue = zgm[id*z_inc + z_offset];
    real result;
    real alpha_times_x;
    Multiply(alpha_times_x, alpha, xvalue);   // alpha * x
    Multiply(result, alpha_times_x, yvalue);  // (alpha * x) * y
    MultiplyAdd(result, beta, zvalue);        // + beta * z
    zgm[id*z_inc + z_offset] = result;
  }
}

// Faster version of the kernel: no offsets and unit strides only, vectorized loads/stores, with
// a bounds check (if-statement). Requires 'n' to be divisible by 'VW' and 'WPT'; the host side
// (xhad.cpp) only selects this kernel when that holds.
__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
void XhadFaster(const int n, const real_arg arg_alpha, const real_arg arg_beta,
                const __global realV* restrict xgm, const __global realV* restrict ygm,
                __global realV* zgm) {
  const real alpha = GetRealArg(arg_alpha);
  const real beta = GetRealArg(arg_beta);

  // Each thread processes WPT vector-elements, strided by the total NDRange size
  if (get_global_id(0) < n / (VW)) {
    #pragma unroll
    for (int _w = 0; _w < WPT; _w += 1) {
      const int id = _w*get_global_size(0) + get_global_id(0);
      realV xvalue = xgm[id];
      realV yvalue = ygm[id];
      realV zvalue = zgm[id];
      realV result;
      realV alpha_times_x;
      alpha_times_x = MultiplyVector(alpha_times_x, alpha, xvalue);
      result = MultiplyVectorVector(result, alpha_times_x, yvalue);
      zgm[id] = MultiplyAddVector(result, beta, zvalue);
    }
  }
}
// Fastest version of the kernel: no offsets, unit strides, vectorized accesses and no bounds
// check. Requires 'n' to be divisible by 'VW', 'WGS' and 'WPT'; the host side (xhad.cpp) only
// selects this kernel when that holds.
__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
void XhadFastest(const int n, const real_arg arg_alpha, const real_arg arg_beta,
                 const __global realV* restrict xgm, const __global realV* restrict ygm,
                 __global realV* zgm) {
  const real alpha = GetRealArg(arg_alpha);
  const real beta = GetRealArg(arg_beta);

  #pragma unroll
  for (int _w = 0; _w < WPT; _w += 1) {
    const int id = _w*get_global_size(0) + get_global_id(0);
    realV xvalue = xgm[id];
    realV yvalue = ygm[id];
    realV zvalue = zgm[id];
    realV result;
    realV alpha_times_x;
    alpha_times_x = MultiplyVector(alpha_times_x, alpha, xvalue);
    result = MultiplyVectorVector(result, alpha_times_x, yvalue);
    zgm[id] = MultiplyAddVector(result, beta, zvalue);
  }
}

// =================================================================================================

// End of the C++11 raw string literal
)"

// =================================================================================================

// --- next file in this commit: src/routines/levelx/xhad.cpp ---

// Constructor: now compiles the dedicated xhad kernels (previously the xaxpy kernel source was
// included here) while still re-using the "Xaxpy" tuning-parameter database entry.
template <typename T>
Xhad<T>::Xhad(Queue &queue, EventPointer event, const std::string &name):
    Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/level1.opencl"
#include "../../kernels/level1/xhad.opencl"
    }) {
}

// Added interior of Xhad<T>::DoHad.
// NOTE(review): this is a hunk fragment — the enclosing function's full signature is not visible
// in this diff; the code below selects one of the three kernel versions and launches it.

  // Determines whether or not the fast versions can be used: they require zero offsets, unit
  // increments on all three vectors, and 'n' divisible by the relevant tuning parameters
  const auto use_faster_kernel = (x_offset == 0) && (x_inc == 1) &&
                                 (y_offset == 0) && (y_inc == 1) &&
                                 (z_offset == 0) && (z_inc == 1) &&
                                 IsMultiple(n, db_["WPT"]*db_["VW"]);
  const auto use_fastest_kernel = use_faster_kernel &&
                                  IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]);

  // If possible, run the fast version of the kernel
  const auto kernel_name = (use_fastest_kernel) ? "XhadFastest" :
                           (use_faster_kernel) ? "XhadFaster" : "Xhad";

  // Retrieves the Xhad kernel from the compiled binary
  auto kernel = Kernel(program_, kernel_name);

  // Sets the kernel arguments. NOTE(review): 'use_fastest_kernel' implies 'use_faster_kernel',
  // so the second operand of '||' below is redundant (harmless). The fast kernels take only the
  // three buffers; the general kernel additionally takes offset/increment triples.
  if (use_faster_kernel || use_fastest_kernel) {
    kernel.SetArgument(0, static_cast<int>(n));
    kernel.SetArgument(1, GetRealArg(alpha));
    kernel.SetArgument(2, GetRealArg(beta));
    kernel.SetArgument(3, x_buffer());
    kernel.SetArgument(4, y_buffer());
    kernel.SetArgument(5, z_buffer());
  }
  else {
    kernel.SetArgument(0, static_cast<int>(n));
    kernel.SetArgument(1, GetRealArg(alpha));
    kernel.SetArgument(2, GetRealArg(beta));
    kernel.SetArgument(3, x_buffer());
    kernel.SetArgument(4, static_cast<int>(x_offset));
    kernel.SetArgument(5, static_cast<int>(x_inc));
    kernel.SetArgument(6, y_buffer());
    kernel.SetArgument(7, static_cast<int>(y_offset));
    kernel.SetArgument(8, static_cast<int>(y_inc));
    kernel.SetArgument(9, z_buffer());
    kernel.SetArgument(10, static_cast<int>(z_offset));
    kernel.SetArgument(11, static_cast<int>(z_inc));
  }

  // Launches the kernel with an NDRange matching each kernel's assumptions
  if (use_fastest_kernel) {
    // Exactly one thread per WPT*VW elements; 'n' is known to be a multiple of WGS*WPT*VW here
    auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
    auto local = std::vector<size_t>{db_["WGS"]};
    RunKernel(kernel, queue_, device_, global, local, event_);
  }
  else if (use_faster_kernel) {
    // Rounded up to a whole number of work-groups; the kernel's if-statement guards the excess
    auto global = std::vector<size_t>{Ceil(CeilDiv(n, db_["WPT"]*db_["VW"]), db_["WGS"])};
    auto local = std::vector<size_t>{db_["WGS"]};
    RunKernel(kernel, queue_, device_, global, local, event_);
  }
  else {
    // General kernel: any launch size works thanks to its global-size-strided loop
    const auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
    auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
    auto local = std::vector<size_t>{db_["WGS"]};
    RunKernel(kernel, queue_, device_, global, local, event_);
  }
}
================================================================================================= diff --git a/test/routines/levelx/xhad.hpp b/test/routines/levelx/xhad.hpp index fc47a7d6..3e40de87 100644 --- a/test/routines/levelx/xhad.hpp +++ b/test/routines/levelx/xhad.hpp @@ -21,12 +21,43 @@ namespace clblast { // ================================================================================================= +template <typename T> +StatusCode RunReference(const Arguments<T> &args, BuffersHost<T> &buffers_host) { + for (auto index = size_t{0}; index < args.n; ++index) { + const auto x = buffers_host.x_vec[index * args.x_inc + args.x_offset]; + const auto y = buffers_host.y_vec[index * args.y_inc + args.y_offset]; + const auto z = buffers_host.c_mat[index]; // * args.z_inc + args.z_offset]; + buffers_host.c_mat[index] = args.alpha * x * y + args.beta * z; + } + return StatusCode::kSuccess; +} + +// Half-precision version calling the above reference implementation after conversions +template <> +StatusCode RunReference<half>(const Arguments<half> &args, BuffersHost<half> &buffers_host) { + auto x_buffer2 = HalfToFloatBuffer(buffers_host.x_vec); + auto y_buffer2 = HalfToFloatBuffer(buffers_host.y_vec); + auto c_buffer2 = HalfToFloatBuffer(buffers_host.c_mat); + auto dummy = std::vector<float>(0); + auto buffers2 = BuffersHost<float>{x_buffer2, y_buffer2, dummy, dummy, c_buffer2, dummy, dummy}; + auto args2 = Arguments<float>(); + args2.x_size = args.x_size; args2.y_size = args.y_size; args2.c_size = args.c_size; + args2.x_inc = args.x_inc; args2.y_inc = args.y_inc; args2.n = args.n; + args2.x_offset = args.x_offset; args2.y_offset = args.y_offset; + args2.alpha = HalfToFloat(args.alpha); args2.beta = HalfToFloat(args.beta); + auto status = RunReference(args2, buffers2); + FloatToHalfBuffer(buffers_host.c_mat, buffers2.c_mat); + return status; +} + +// ================================================================================================= + // See 
comment at top of file for a description of the class template <typename T> class TestXhad { public: - // The BLAS level: 4 for the extra routines + // The BLAS level: 4 for the extra routines (note: tested with matrix-size values for 'n') static size_t BLASLevel() { return 4; } // The list of arguments relevant for this routine @@ -34,7 +65,7 @@ public: return {kArgN, kArgXInc, kArgYInc, kArgXOffset, kArgYOffset, - kArgAlpha}; + kArgAlpha, kArgBeta}; } static std::vector<std::string> BuffersIn() { return {kBufVecX, kBufVecY, kBufMatC}; } static std::vector<std::string> BuffersOut() { return {kBufMatC}; } @@ -135,19 +166,6 @@ public: }; // ================================================================================================= - -template <typename T> -StatusCode RunReference(const Arguments<T> &args, BuffersHost<T> &buffers_host) { - for (auto index = size_t{0}; index < args.n; ++index) { - const auto x = buffers_host.x_vec[index * args.x_inc + args.x_offset]; - const auto y = buffers_host.y_vec[index * args.y_inc + args.y_offset]; - const auto z = buffers_host.c_mat[index]; // * args.z_inc + args.z_offset]; - buffers_host.c_mat[index] = x * y * args.alpha + z * args.beta; - } - return StatusCode::kSuccess; -} - -// ================================================================================================= } // namespace clblast // CLBLAST_TEST_ROUTINES_XHAD_H_ |