summaryrefslogtreecommitdiff
path: root/src/kernels
diff options
context:
space:
mode:
authorCedric Nugteren <web@cedricnugteren.nl>2018-02-02 21:18:37 +0100
committerCedric Nugteren <web@cedricnugteren.nl>2018-02-02 21:18:37 +0100
commit69ed46c8da69ee18338eca5102ead43410cc01b5 (patch)
tree575c4eef2a3210117a574f25f81662c503ec207d /src/kernels
parentae66782eabc574a507b8cfe2b83f2df23b1a36c1 (diff)
Implemented the XHAD Hadamard product routine
Diffstat (limited to 'src/kernels')
-rw-r--r--src/kernels/level1/xhad.opencl145
1 files changed, 145 insertions, 0 deletions
diff --git a/src/kernels/level1/xhad.opencl b/src/kernels/level1/xhad.opencl
new file mode 100644
index 00000000..3880b7a4
--- /dev/null
+++ b/src/kernels/level1/xhad.opencl
@@ -0,0 +1,145 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the Xhad kernel. It contains one fast vectorized version in case of unit
+// strides (incx=incy=incz=1) and no offsets (offx=offy=offz=0). Another version is more general,
+// but doesn't support vector data-types. Based on the XAXPY kernels.
+//
+// This kernel uses the level-1 BLAS common tuning parameters.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// A vector-vector multiply function. See also level1.opencl for a vector-scalar version
+INLINE_FUNC realV MultiplyVectorVector(realV cvec, const realV aval, const realV bvec) {
+ #if VW == 1
+ Multiply(cvec, aval, bvec);
+ #elif VW == 2
+ Multiply(cvec.x, aval.x, bvec.x);
+ Multiply(cvec.y, aval.y, bvec.y);
+ #elif VW == 4
+ Multiply(cvec.x, aval.x, bvec.x);
+ Multiply(cvec.y, aval.y, bvec.y);
+ Multiply(cvec.z, aval.z, bvec.z);
+ Multiply(cvec.w, aval.w, bvec.w);
+ #elif VW == 8
+ Multiply(cvec.s0, aval.s0, bvec.s0);
+ Multiply(cvec.s1, aval.s1, bvec.s1);
+ Multiply(cvec.s2, aval.s2, bvec.s2);
+ Multiply(cvec.s3, aval.s3, bvec.s3);
+ Multiply(cvec.s4, aval.s4, bvec.s4);
+ Multiply(cvec.s5, aval.s5, bvec.s5);
+ Multiply(cvec.s6, aval.s6, bvec.s6);
+ Multiply(cvec.s7, aval.s7, bvec.s7);
+ #elif VW == 16
+ Multiply(cvec.s0, aval.s0, bvec.s0);
+ Multiply(cvec.s1, aval.s1, bvec.s1);
+ Multiply(cvec.s2, aval.s2, bvec.s2);
+ Multiply(cvec.s3, aval.s3, bvec.s3);
+ Multiply(cvec.s4, aval.s4, bvec.s4);
+ Multiply(cvec.s5, aval.s5, bvec.s5);
+ Multiply(cvec.s6, aval.s6, bvec.s6);
+ Multiply(cvec.s7, aval.s7, bvec.s7);
+ Multiply(cvec.s8, aval.s8, bvec.s8);
+ Multiply(cvec.s9, aval.s9, bvec.s9);
+ Multiply(cvec.sA, aval.sA, bvec.sA);
+ Multiply(cvec.sB, aval.sB, bvec.sB);
+ Multiply(cvec.sC, aval.sC, bvec.sC);
+ Multiply(cvec.sD, aval.sD, bvec.sD);
+ Multiply(cvec.sE, aval.sE, bvec.sE);
+ Multiply(cvec.sF, aval.sF, bvec.sF);
+ #endif
+ return cvec;
+}
+
+// =================================================================================================
+
+// Full version of the kernel with offsets and strided accesses
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void Xhad(const int n, const real_arg arg_alpha, const real_arg arg_beta,
+ const __global real* restrict xgm, const int x_offset, const int x_inc,
+ const __global real* restrict ygm, const int y_offset, const int y_inc,
+ __global real* zgm, const int z_offset, const int z_inc) {
+ const real alpha = GetRealArg(arg_alpha);
+ const real beta = GetRealArg(arg_beta);
+
+ // Loops over the work that needs to be done (allows for an arbitrary number of threads)
+ for (int id = get_global_id(0); id < n; id += get_global_size(0)) {
+ real xvalue = xgm[id*x_inc + x_offset];
+ real yvalue = ygm[id*y_inc + y_offset];
+ real zvalue = zgm[id*z_inc + z_offset];
+ real result;
+ real alpha_times_x;
+ Multiply(alpha_times_x, alpha, xvalue);
+ Multiply(result, alpha_times_x, yvalue);
+ MultiplyAdd(result, beta, zvalue);
+ zgm[id*z_inc + z_offset] = result;
+ }
+}
+
+// Faster version of the kernel without offsets and strided accesses but with if-statement. Also
+// assumes that 'n' is dividable by 'VW' and 'WPT'.
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void XhadFaster(const int n, const real_arg arg_alpha, const real_arg arg_beta,
+ const __global realV* restrict xgm, const __global realV* restrict ygm,
+ __global realV* zgm) {
+ const real alpha = GetRealArg(arg_alpha);
+ const real beta = GetRealArg(arg_beta);
+
+ if (get_global_id(0) < n / (VW)) {
+ #pragma unroll
+ for (int _w = 0; _w < WPT; _w += 1) {
+ const int id = _w*get_global_size(0) + get_global_id(0);
+ realV xvalue = xgm[id];
+ realV yvalue = ygm[id];
+ realV zvalue = zgm[id];
+ realV result;
+ realV alpha_times_x;
+ alpha_times_x = MultiplyVector(alpha_times_x, alpha, xvalue);
+ result = MultiplyVectorVector(result, alpha_times_x, yvalue);
+ zgm[id] = MultiplyAddVector(result, beta, zvalue);
+ }
+ }
+}
+
+// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
+// dividable by 'VW', 'WGS' and 'WPT'.
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void XhadFastest(const int n, const real_arg arg_alpha, const real_arg arg_beta,
+ const __global realV* restrict xgm, const __global realV* restrict ygm,
+ __global realV* zgm) {
+ const real alpha = GetRealArg(arg_alpha);
+ const real beta = GetRealArg(arg_beta);
+
+ #pragma unroll
+ for (int _w = 0; _w < WPT; _w += 1) {
+ const int id = _w*get_global_size(0) + get_global_id(0);
+ realV xvalue = xgm[id];
+ realV yvalue = ygm[id];
+ realV zvalue = zgm[id];
+ realV result;
+ realV alpha_times_x;
+ alpha_times_x = MultiplyVector(alpha_times_x, alpha, xvalue);
+ result = MultiplyVectorVector(result, alpha_times_x, yvalue);
+ zgm[id] = MultiplyAddVector(result, beta, zvalue);
+ }
+}
+
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================