-rw-r--r--  CHANGELOG                        |   4
-rw-r--r--  README.md                        |   1
-rw-r--r--  ROADMAP.md                       |   2
-rw-r--r--  src/kernels/level1/xhad.opencl   | 145
-rw-r--r--  src/routines/levelx/xhad.cpp     |  58
-rw-r--r--  test/routines/levelx/xhad.hpp    |  48
6 files changed, 241 insertions(+), 17 deletions(-)
diff --git a/CHANGELOG b/CHANGELOG
index 8208b77a..13b20d9d 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,4 +1,8 @@
+Development (next version)
+- Added non-BLAS level-1 routines:
+ * SHAD/DHAD/CHAD/ZHAD/HHAD (Hadamard element-wise vector-vector product)
+
Version 1.3.0
- Re-designed and integrated the auto-tuner, no more dependency on CLTune
- Made it possible to override the tuning parameters in the clients straight from JSON tuning files
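The new routines compute the element-wise update z = alpha * (x .* y) + beta * z. Below is a
minimal host-side sketch of calling the single-precision variant, assuming the C++ API follows
CLBlast's usual buffer/offset/increment convention; the exact clblast::Had signature is inferred
from the kernel arguments in this commit:

    #include <clblast.h>

    // Sketch: z = alpha * (x .* y) + beta * z for n contiguous floats (SHAD).
    // Assumption: clblast::Had takes (buffer, offset, increment) triplets like the
    // kernel in this commit; x, y and z are valid cl_mem buffers of at least n floats.
    void had_example(cl_command_queue queue, cl_mem x, cl_mem y, cl_mem z, const size_t n) {
      const auto alpha = 1.0f;
      const auto beta = 0.0f;  // beta == 0 yields the plain Hadamard product
      cl_event event = nullptr;
      const auto status = clblast::Had(n, alpha,
                                       x, 0, 1,  // x: offset 0, unit stride
                                       y, 0, 1,  // y: offset 0, unit stride
                                       beta,
                                       z, 0, 1,  // z: offset 0, unit stride
                                       &queue, &event);
      if (status == clblast::StatusCode::kSuccess) {
        clWaitForEvents(1, &event);
        clReleaseEvent(event);
      }
    }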
diff --git a/README.md b/README.md
index 4fc6044d..f05889bb 100644
--- a/README.md
+++ b/README.md
@@ -319,6 +319,7 @@ In addition, some extra non-BLAS routines are also supported by CLBlast, classif
| IxAMIN | ✔ | ✔ | ✔ | ✔ | ✔ |
| IxMAX | ✔ | ✔ | ✔ | ✔ | ✔ |
| IxMIN | ✔ | ✔ | ✔ | ✔ | ✔ |
+| xHAD | ✔ | ✔ | ✔ | ✔ | ✔ | (Hadamard product)
| xOMATCOPY | ✔ | ✔ | ✔ | ✔ | ✔ |
| xIM2COL | ✔ | ✔ | ✔ | ✔ | ✔ |
diff --git a/ROADMAP.md b/ROADMAP.md
index 33775034..df42f75c 100644
--- a/ROADMAP.md
+++ b/ROADMAP.md
@@ -13,7 +13,7 @@ This file gives an overview of the main features planned for addition to CLBlast
| [#207](https://github.com/CNugteren/CLBlast/issues/207) | Dec '17 | CNugteren | ✔ | Tuning of the TRSM/TRSV routines |
| [#195](https://github.com/CNugteren/CLBlast/issues/195) | Jan '18 | CNugteren | ✔ | Extra GEMM API with pre-allocated temporary buffer |
| [#95](https://github.com/CNugteren/CLBlast/issues/95) & #237 | Jan '18 | CNugteren | ✔ | Implement strided batch GEMM |
-| [#224](https://github.com/CNugteren/CLBlast/issues/224) | Jan-Feb '18 | CNugteren | | Implement Hadamard product (element-wise vector-vector product) |
+| [#224](https://github.com/CNugteren/CLBlast/issues/224) | Jan-Feb '18 | CNugteren | ✔ | Implement Hadamard product (element-wise vector-vector product) |
| [#233](https://github.com/CNugteren/CLBlast/issues/233) | Feb '18 | CNugteren | | Add CLBlast to common package managers |
| [#223](https://github.com/CNugteren/CLBlast/issues/223) | Feb '18 | CNugteren | | Python OpenCL interface |
| [#169](https://github.com/CNugteren/CLBlast/issues/169) | ?? | dividiti | | Problem-specific tuning parameter selection |
diff --git a/src/kernels/level1/xhad.opencl b/src/kernels/level1/xhad.opencl
new file mode 100644
index 00000000..3880b7a4
--- /dev/null
+++ b/src/kernels/level1/xhad.opencl
@@ -0,0 +1,145 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the Xhad kernel. It contains fast vectorized versions in case of unit
+// strides (incx=incy=incz=1) and no offsets (offx=offy=offz=0). Another version is more general,
+// but doesn't support vector data-types. Based on the XAXPY kernels.
+//
+// This kernel uses the level-1 BLAS common tuning parameters.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// A vector-vector multiply function. See also level1.opencl for a vector-scalar version
+INLINE_FUNC realV MultiplyVectorVector(realV cvec, const realV aval, const realV bvec) {
+ #if VW == 1
+ Multiply(cvec, aval, bvec);
+ #elif VW == 2
+ Multiply(cvec.x, aval.x, bvec.x);
+ Multiply(cvec.y, aval.y, bvec.y);
+ #elif VW == 4
+ Multiply(cvec.x, aval.x, bvec.x);
+ Multiply(cvec.y, aval.y, bvec.y);
+ Multiply(cvec.z, aval.z, bvec.z);
+ Multiply(cvec.w, aval.w, bvec.w);
+ #elif VW == 8
+ Multiply(cvec.s0, aval.s0, bvec.s0);
+ Multiply(cvec.s1, aval.s1, bvec.s1);
+ Multiply(cvec.s2, aval.s2, bvec.s2);
+ Multiply(cvec.s3, aval.s3, bvec.s3);
+ Multiply(cvec.s4, aval.s4, bvec.s4);
+ Multiply(cvec.s5, aval.s5, bvec.s5);
+ Multiply(cvec.s6, aval.s6, bvec.s6);
+ Multiply(cvec.s7, aval.s7, bvec.s7);
+ #elif VW == 16
+ Multiply(cvec.s0, aval.s0, bvec.s0);
+ Multiply(cvec.s1, aval.s1, bvec.s1);
+ Multiply(cvec.s2, aval.s2, bvec.s2);
+ Multiply(cvec.s3, aval.s3, bvec.s3);
+ Multiply(cvec.s4, aval.s4, bvec.s4);
+ Multiply(cvec.s5, aval.s5, bvec.s5);
+ Multiply(cvec.s6, aval.s6, bvec.s6);
+ Multiply(cvec.s7, aval.s7, bvec.s7);
+ Multiply(cvec.s8, aval.s8, bvec.s8);
+ Multiply(cvec.s9, aval.s9, bvec.s9);
+ Multiply(cvec.sA, aval.sA, bvec.sA);
+ Multiply(cvec.sB, aval.sB, bvec.sB);
+ Multiply(cvec.sC, aval.sC, bvec.sC);
+ Multiply(cvec.sD, aval.sD, bvec.sD);
+ Multiply(cvec.sE, aval.sE, bvec.sE);
+ Multiply(cvec.sF, aval.sF, bvec.sF);
+ #endif
+ return cvec;
+}
+
+// =================================================================================================
+
+// Full version of the kernel with offsets and strided accesses
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void Xhad(const int n, const real_arg arg_alpha, const real_arg arg_beta,
+ const __global real* restrict xgm, const int x_offset, const int x_inc,
+ const __global real* restrict ygm, const int y_offset, const int y_inc,
+ __global real* zgm, const int z_offset, const int z_inc) {
+ const real alpha = GetRealArg(arg_alpha);
+ const real beta = GetRealArg(arg_beta);
+
+ // Loops over the work that needs to be done (allows for an arbitrary number of threads)
+ for (int id = get_global_id(0); id < n; id += get_global_size(0)) {
+ real xvalue = xgm[id*x_inc + x_offset];
+ real yvalue = ygm[id*y_inc + y_offset];
+ real zvalue = zgm[id*z_inc + z_offset];
+ real result;
+ real alpha_times_x;
+ Multiply(alpha_times_x, alpha, xvalue);
+ Multiply(result, alpha_times_x, yvalue);
+ MultiplyAdd(result, beta, zvalue);
+ zgm[id*z_inc + z_offset] = result;
+ }
+}
+
+// Faster version of the kernel without offsets and strided accesses, but with an if-statement
+// guard. Also assumes that 'n' is divisible by 'VW' and 'WPT'.
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void XhadFaster(const int n, const real_arg arg_alpha, const real_arg arg_beta,
+ const __global realV* restrict xgm, const __global realV* restrict ygm,
+ __global realV* zgm) {
+ const real alpha = GetRealArg(arg_alpha);
+ const real beta = GetRealArg(arg_beta);
+
+ if (get_global_id(0) < n / (VW)) {
+ #pragma unroll
+ for (int _w = 0; _w < WPT; _w += 1) {
+ const int id = _w*get_global_size(0) + get_global_id(0);
+ realV xvalue = xgm[id];
+ realV yvalue = ygm[id];
+ realV zvalue = zgm[id];
+ realV result;
+ realV alpha_times_x;
+ alpha_times_x = MultiplyVector(alpha_times_x, alpha, xvalue);
+ result = MultiplyVectorVector(result, alpha_times_x, yvalue);
+ zgm[id] = MultiplyAddVector(result, beta, zvalue);
+ }
+ }
+}
+
+// Fastest version of the kernel without offsets and strided accesses. Also assumes that 'n' is
+// divisible by 'VW', 'WGS' and 'WPT'.
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void XhadFastest(const int n, const real_arg arg_alpha, const real_arg arg_beta,
+ const __global realV* restrict xgm, const __global realV* restrict ygm,
+ __global realV* zgm) {
+ const real alpha = GetRealArg(arg_alpha);
+ const real beta = GetRealArg(arg_beta);
+
+ #pragma unroll
+ for (int _w = 0; _w < WPT; _w += 1) {
+ const int id = _w*get_global_size(0) + get_global_id(0);
+ realV xvalue = xgm[id];
+ realV yvalue = ygm[id];
+ realV zvalue = zgm[id];
+ realV result;
+ realV alpha_times_x;
+ alpha_times_x = MultiplyVector(alpha_times_x, alpha, xvalue);
+ result = MultiplyVectorVector(result, alpha_times_x, yvalue);
+ zgm[id] = MultiplyAddVector(result, beta, zvalue);
+ }
+}
+
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/routines/levelx/xhad.cpp b/src/routines/levelx/xhad.cpp
index 46ae8031..da416cc7 100644
--- a/src/routines/levelx/xhad.cpp
+++ b/src/routines/levelx/xhad.cpp
@@ -24,7 +24,7 @@ template <typename T>
Xhad<T>::Xhad(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/level1.opencl"
-#include "../../kernels/level1/xaxpy.opencl"
+#include "../../kernels/level1/xhad.opencl"
}) {
}
@@ -45,6 +45,62 @@ void Xhad<T>::DoHad(const size_t n, const T alpha,
TestVectorY(n, y_buffer, y_offset, y_inc);
TestVectorY(n, z_buffer, z_offset, z_inc); // TODO: Make a TestVectorZ function with error codes
+  // Determines whether the faster or fastest variant of the kernel can be used
+ const auto use_faster_kernel = (x_offset == 0) && (x_inc == 1) &&
+ (y_offset == 0) && (y_inc == 1) &&
+ (z_offset == 0) && (z_inc == 1) &&
+ IsMultiple(n, db_["WPT"]*db_["VW"]);
+ const auto use_fastest_kernel = use_faster_kernel &&
+ IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]);
+
+  // Selects the fastest variant of the kernel that is applicable
+ const auto kernel_name = (use_fastest_kernel) ? "XhadFastest" :
+ (use_faster_kernel) ? "XhadFaster" : "Xhad";
+
+ // Retrieves the Xhad kernel from the compiled binary
+ auto kernel = Kernel(program_, kernel_name);
+
+ // Sets the kernel arguments
+ if (use_faster_kernel || use_fastest_kernel) {
+ kernel.SetArgument(0, static_cast<int>(n));
+ kernel.SetArgument(1, GetRealArg(alpha));
+ kernel.SetArgument(2, GetRealArg(beta));
+ kernel.SetArgument(3, x_buffer());
+ kernel.SetArgument(4, y_buffer());
+ kernel.SetArgument(5, z_buffer());
+ }
+ else {
+ kernel.SetArgument(0, static_cast<int>(n));
+ kernel.SetArgument(1, GetRealArg(alpha));
+ kernel.SetArgument(2, GetRealArg(beta));
+ kernel.SetArgument(3, x_buffer());
+ kernel.SetArgument(4, static_cast<int>(x_offset));
+ kernel.SetArgument(5, static_cast<int>(x_inc));
+ kernel.SetArgument(6, y_buffer());
+ kernel.SetArgument(7, static_cast<int>(y_offset));
+ kernel.SetArgument(8, static_cast<int>(y_inc));
+ kernel.SetArgument(9, z_buffer());
+ kernel.SetArgument(10, static_cast<int>(z_offset));
+ kernel.SetArgument(11, static_cast<int>(z_inc));
+ }
+
+ // Launches the kernel
+ if (use_fastest_kernel) {
+ auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
+ auto local = std::vector<size_t>{db_["WGS"]};
+ RunKernel(kernel, queue_, device_, global, local, event_);
+ }
+ else if (use_faster_kernel) {
+ auto global = std::vector<size_t>{Ceil(CeilDiv(n, db_["WPT"]*db_["VW"]), db_["WGS"])};
+ auto local = std::vector<size_t>{db_["WGS"]};
+ RunKernel(kernel, queue_, device_, global, local, event_);
+ }
+ else {
+ const auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
+ auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
+ auto local = std::vector<size_t>{db_["WGS"]};
+ RunKernel(kernel, queue_, device_, global, local, event_);
+ }
}
// =================================================================================================
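To make the kernel dispatch above concrete, take hypothetical tuned values WGS=64, WPT=4 and VW=2
with contiguous unit-stride buffers: n = 8192 is a multiple of WGS*WPT*VW = 512, so XhadFastest
runs with global size 8192/(4*2) = 1024 and local size 64; n = 1040 is a multiple of WPT*VW = 8
but not of 512, so XhadFaster runs with global size Ceil(CeilDiv(1040, 8), 64) = Ceil(130, 64) =
192; and any non-zero offset or non-unit increment falls back to the generic Xhad kernel.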
diff --git a/test/routines/levelx/xhad.hpp b/test/routines/levelx/xhad.hpp
index fc47a7d6..3e40de87 100644
--- a/test/routines/levelx/xhad.hpp
+++ b/test/routines/levelx/xhad.hpp
@@ -21,12 +21,43 @@
namespace clblast {
// =================================================================================================
+template <typename T>
+StatusCode RunReference(const Arguments<T> &args, BuffersHost<T> &buffers_host) {
+ for (auto index = size_t{0}; index < args.n; ++index) {
+ const auto x = buffers_host.x_vec[index * args.x_inc + args.x_offset];
+ const auto y = buffers_host.y_vec[index * args.y_inc + args.y_offset];
+ const auto z = buffers_host.c_mat[index]; // * args.z_inc + args.z_offset];
+ buffers_host.c_mat[index] = args.alpha * x * y + args.beta * z;
+ }
+ return StatusCode::kSuccess;
+}
+
+// Half-precision version calling the above reference implementation after conversions
+template <>
+StatusCode RunReference<half>(const Arguments<half> &args, BuffersHost<half> &buffers_host) {
+ auto x_buffer2 = HalfToFloatBuffer(buffers_host.x_vec);
+ auto y_buffer2 = HalfToFloatBuffer(buffers_host.y_vec);
+ auto c_buffer2 = HalfToFloatBuffer(buffers_host.c_mat);
+ auto dummy = std::vector<float>(0);
+ auto buffers2 = BuffersHost<float>{x_buffer2, y_buffer2, dummy, dummy, c_buffer2, dummy, dummy};
+ auto args2 = Arguments<float>();
+ args2.x_size = args.x_size; args2.y_size = args.y_size; args2.c_size = args.c_size;
+ args2.x_inc = args.x_inc; args2.y_inc = args.y_inc; args2.n = args.n;
+ args2.x_offset = args.x_offset; args2.y_offset = args.y_offset;
+ args2.alpha = HalfToFloat(args.alpha); args2.beta = HalfToFloat(args.beta);
+ auto status = RunReference(args2, buffers2);
+ FloatToHalfBuffer(buffers_host.c_mat, buffers2.c_mat);
+ return status;
+}
+
+// =================================================================================================
+
// See comment at top of file for a description of the class
template <typename T>
class TestXhad {
public:
- // The BLAS level: 4 for the extra routines
+ // The BLAS level: 4 for the extra routines (note: tested with matrix-size values for 'n')
static size_t BLASLevel() { return 4; }
// The list of arguments relevant for this routine
@@ -34,7 +65,7 @@ public:
return {kArgN,
kArgXInc, kArgYInc,
kArgXOffset, kArgYOffset,
- kArgAlpha};
+ kArgAlpha, kArgBeta};
}
static std::vector<std::string> BuffersIn() { return {kBufVecX, kBufVecY, kBufMatC}; }
static std::vector<std::string> BuffersOut() { return {kBufMatC}; }
@@ -135,19 +166,6 @@ public:
};
// =================================================================================================
-
-template <typename T>
-StatusCode RunReference(const Arguments<T> &args, BuffersHost<T> &buffers_host) {
- for (auto index = size_t{0}; index < args.n; ++index) {
- const auto x = buffers_host.x_vec[index * args.x_inc + args.x_offset];
- const auto y = buffers_host.y_vec[index * args.y_inc + args.y_offset];
- const auto z = buffers_host.c_mat[index]; // * args.z_inc + args.z_offset];
- buffers_host.c_mat[index] = x * y * args.alpha + z * args.beta;
- }
- return StatusCode::kSuccess;
-}
-
-// =================================================================================================
} // namespace clblast
// CLBLAST_TEST_ROUTINES_XHAD_H_
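The convert-run-convert pattern in RunReference<half> above generalizes to validating any new
half-precision routine against a float reference. A minimal sketch using the same conversion
helpers seen in this diff (HalfToFloatBuffer, FloatToHalfBuffer, HalfToFloat); the function name,
standalone form and header location are hypothetical:

    #include <vector>
    #include "utilities/utilities.hpp"  // assumed location of the half-conversion helpers

    // Hypothetical standalone fp16 reference: converts inputs to float, runs the
    // float computation, and converts the result back to half precision.
    std::vector<half> reference_had_fp16(const std::vector<half>& x, const std::vector<half>& y,
                                         std::vector<half> z, const half alpha, const half beta) {
      const auto x_f = HalfToFloatBuffer(x);
      const auto y_f = HalfToFloatBuffer(y);
      auto z_f = HalfToFloatBuffer(z);
      const auto alpha_f = HalfToFloat(alpha);
      const auto beta_f = HalfToFloat(beta);
      for (auto i = size_t{0}; i < z_f.size(); ++i) {
        z_f[i] = alpha_f * x_f[i] * y_f[i] + beta_f * z_f[i];
      }
      FloatToHalfBuffer(z, z_f);  // write the float results back as halfs
      return z;
    }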