diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2018-02-02 21:18:37 +0100 |
---|---|---|
committer | Cedric Nugteren <web@cedricnugteren.nl> | 2018-02-02 21:18:37 +0100 |
commit | 69ed46c8da69ee18338eca5102ead43410cc01b5 (patch) | |
tree | 575c4eef2a3210117a574f25f81662c503ec207d /src/kernels | |
parent | ae66782eabc574a507b8cfe2b83f2df23b1a36c1 (diff) |
Implemented the XHAD Hadamard product routine
Diffstat (limited to 'src/kernels')
-rw-r--r-- | src/kernels/level1/xhad.opencl | 145 |
1 files changed, 145 insertions, 0 deletions
diff --git a/src/kernels/level1/xhad.opencl b/src/kernels/level1/xhad.opencl new file mode 100644 index 00000000..3880b7a4 --- /dev/null +++ b/src/kernels/level1/xhad.opencl @@ -0,0 +1,145 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file contains the Xhad kernel. It contains one fast vectorized version in case of unit +// strides (incx=incy=incz=1) and no offsets (offx=offy=offz=0). Another version is more general, +// but doesn't support vector data-types. Based on the XAXPY kernels. +// +// This kernel uses the level-1 BLAS common tuning parameters. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// A vector-vector multiply function. See also level1.opencl for a vector-scalar version +INLINE_FUNC realV MultiplyVectorVector(realV cvec, const realV aval, const realV bvec) { + #if VW == 1 + Multiply(cvec, aval, bvec); + #elif VW == 2 + Multiply(cvec.x, aval.x, bvec.x); + Multiply(cvec.y, aval.y, bvec.y); + #elif VW == 4 + Multiply(cvec.x, aval.x, bvec.x); + Multiply(cvec.y, aval.y, bvec.y); + Multiply(cvec.z, aval.z, bvec.z); + Multiply(cvec.w, aval.w, bvec.w); + #elif VW == 8 + Multiply(cvec.s0, aval.s0, bvec.s0); + Multiply(cvec.s1, aval.s1, bvec.s1); + Multiply(cvec.s2, aval.s2, bvec.s2); + Multiply(cvec.s3, aval.s3, bvec.s3); + Multiply(cvec.s4, aval.s4, bvec.s4); + Multiply(cvec.s5, aval.s5, bvec.s5); + Multiply(cvec.s6, aval.s6, bvec.s6); + Multiply(cvec.s7, aval.s7, bvec.s7); + #elif VW == 16 + Multiply(cvec.s0, aval.s0, bvec.s0); + Multiply(cvec.s1, aval.s1, bvec.s1); + Multiply(cvec.s2, aval.s2, bvec.s2); + Multiply(cvec.s3, aval.s3, bvec.s3); + Multiply(cvec.s4, aval.s4, bvec.s4); + Multiply(cvec.s5, aval.s5, bvec.s5); + Multiply(cvec.s6, aval.s6, bvec.s6); + Multiply(cvec.s7, aval.s7, bvec.s7); + Multiply(cvec.s8, aval.s8, bvec.s8); + Multiply(cvec.s9, aval.s9, bvec.s9); + Multiply(cvec.sA, aval.sA, bvec.sA); + Multiply(cvec.sB, aval.sB, bvec.sB); + Multiply(cvec.sC, aval.sC, bvec.sC); + Multiply(cvec.sD, aval.sD, bvec.sD); + Multiply(cvec.sE, aval.sE, bvec.sE); + Multiply(cvec.sF, aval.sF, bvec.sF); + #endif + return cvec; +} + +// ================================================================================================= + +// Full version of the kernel with offsets and strided accesses +__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +void Xhad(const int n, const real_arg arg_alpha, const real_arg arg_beta, + const __global real* restrict xgm, const int x_offset, const int x_inc, + const __global real* restrict ygm, const int y_offset, const int y_inc, + __global real* zgm, const int z_offset, const int z_inc) { + const real alpha = GetRealArg(arg_alpha); + const real beta = GetRealArg(arg_beta); + + // Loops over the work that needs to be done (allows for an arbitrary number of threads) + for (int id = get_global_id(0); id < n; id += get_global_size(0)) { + real xvalue = xgm[id*x_inc + x_offset]; + real yvalue = ygm[id*y_inc + y_offset]; + real zvalue = zgm[id*z_inc + z_offset]; + real result; + real alpha_times_x; + Multiply(alpha_times_x, alpha, xvalue); + Multiply(result, alpha_times_x, yvalue); + MultiplyAdd(result, beta, zvalue); + zgm[id*z_inc + z_offset] = result; + } +} + +// Faster version of the kernel without offsets and strided accesses but with if-statement. Also +// assumes that 'n' is dividable by 'VW' and 'WPT'. +__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +void XhadFaster(const int n, const real_arg arg_alpha, const real_arg arg_beta, + const __global realV* restrict xgm, const __global realV* restrict ygm, + __global realV* zgm) { + const real alpha = GetRealArg(arg_alpha); + const real beta = GetRealArg(arg_beta); + + if (get_global_id(0) < n / (VW)) { + #pragma unroll + for (int _w = 0; _w < WPT; _w += 1) { + const int id = _w*get_global_size(0) + get_global_id(0); + realV xvalue = xgm[id]; + realV yvalue = ygm[id]; + realV zvalue = zgm[id]; + realV result; + realV alpha_times_x; + alpha_times_x = MultiplyVector(alpha_times_x, alpha, xvalue); + result = MultiplyVectorVector(result, alpha_times_x, yvalue); + zgm[id] = MultiplyAddVector(result, beta, zvalue); + } + } +} + +// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is +// dividable by 'VW', 'WGS' and 'WPT'. +__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +void XhadFastest(const int n, const real_arg arg_alpha, const real_arg arg_beta, + const __global realV* restrict xgm, const __global realV* restrict ygm, + __global realV* zgm) { + const real alpha = GetRealArg(arg_alpha); + const real beta = GetRealArg(arg_beta); + + #pragma unroll + for (int _w = 0; _w < WPT; _w += 1) { + const int id = _w*get_global_size(0) + get_global_id(0); + realV xvalue = xgm[id]; + realV yvalue = ygm[id]; + realV zvalue = zgm[id]; + realV result; + realV alpha_times_x; + alpha_times_x = MultiplyVector(alpha_times_x, alpha, xvalue); + result = MultiplyVectorVector(result, alpha_times_x, yvalue); + zgm[id] = MultiplyAddVector(result, beta, zvalue); + } +} + + +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= |