diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2017-04-14 20:16:10 +0200 |
---|---|---|
committer | Cedric Nugteren <web@cedricnugteren.nl> | 2017-04-14 20:16:10 +0200 |
commit | 10205d773e1477fdd634dbc7e224cc71361a9885 (patch) | |
tree | da6ed72c7f530a02a9cae70938fce4b4670066b2 /src/kernels/level1 | |
parent | 0da1e380974007f69b827f6b10ef0243249d0c5e (diff) |
Added a new Xaxpy kernel in between the regular and fast version in
Diffstat (limited to 'src/kernels/level1')
-rw-r--r-- | src/kernels/level1/xaxpy.opencl | 25 |
1 files changed, 22 insertions, 3 deletions
```diff
diff --git a/src/kernels/level1/xaxpy.opencl b/src/kernels/level1/xaxpy.opencl
index f44bbce0..d30d4e55 100644
--- a/src/kernels/level1/xaxpy.opencl
+++ b/src/kernels/level1/xaxpy.opencl
@@ -36,12 +36,31 @@ void Xaxpy(const int n, const real_arg arg_alpha,
   }
 }
 
+// Faster version of the kernel without offsets and strided accesses but with if-statement. Also
+// assumes that 'n' is dividable by 'VW' and 'WPT'.
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void XaxpyFaster(const int n, const real_arg arg_alpha,
+                 const __global realV* restrict xgm,
+                 __global realV* ygm) {
+  const real alpha = GetRealArg(arg_alpha);
+
+  if (get_global_id(0) < n / (VW)) {
+    #pragma unroll
+    for (int w=0; w<WPT; ++w) {
+      const int id = w*get_global_size(0) + get_global_id(0);
+      realV xvalue = xgm[id];
+      realV yvalue = ygm[id];
+      ygm[id] = MultiplyAddVector(yvalue, alpha, xvalue);
+    }
+  }
+}
+
 // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
 // dividable by 'VW', 'WGS' and 'WPT'.
 __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
-void XaxpyFast(const int n, const real_arg arg_alpha,
-               const __global realV* restrict xgm,
-               __global realV* ygm) {
+void XaxpyFastest(const int n, const real_arg arg_alpha,
+                  const __global realV* restrict xgm,
+                  __global realV* ygm) {
   const real alpha = GetRealArg(arg_alpha);
 
   #pragma unroll
```