summaryrefslogtreecommitdiff
path: root/src/kernels/level1/xaxpy.opencl
diff options
context:
space:
mode:
authorCedric Nugteren <web@cedricnugteren.nl>2017-04-14 20:16:10 +0200
committerCedric Nugteren <web@cedricnugteren.nl>2017-04-14 20:16:10 +0200
commit10205d773e1477fdd634dbc7e224cc71361a9885 (patch)
treeda6ed72c7f530a02a9cae70938fce4b4670066b2 /src/kernels/level1/xaxpy.opencl
parent0da1e380974007f69b827f6b10ef0243249d0c5e (diff)
Added a new Xaxpy kernel in between the regular and fast version in
Diffstat (limited to 'src/kernels/level1/xaxpy.opencl')
-rw-r--r--src/kernels/level1/xaxpy.opencl25
1 files changed, 22 insertions, 3 deletions
diff --git a/src/kernels/level1/xaxpy.opencl b/src/kernels/level1/xaxpy.opencl
index f44bbce0..d30d4e55 100644
--- a/src/kernels/level1/xaxpy.opencl
+++ b/src/kernels/level1/xaxpy.opencl
@@ -36,12 +36,31 @@ void Xaxpy(const int n, const real_arg arg_alpha,
}
}
+// Faster version of the kernel without offsets and strided accesses, but with an if-statement that
+// masks out surplus threads. Assumes that 'n' is divisible by 'VW' and 'WPT'.
+//
+// Each of the first n / (VW * WPT) threads handles WPT vector elements: it loads the matching
+// vectors from 'xgm' and 'ygm', combines them with 'alpha' through MultiplyAddVector, and stores
+// the result back into 'ygm'. Threads beyond that count do nothing, so the global size may be
+// padded (e.g. rounded up to a multiple of WGS) without touching memory out of range.
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void XaxpyFaster(const int n, const real_arg arg_alpha,
+                 const __global realV* restrict xgm,
+                 __global realV* ygm) {
+  const real alpha = GetRealArg(arg_alpha);
+
+  // Number of threads that actually have work. It is used both as the guard and as the stride
+  // between the WPT iterations: striding by get_global_size(0) instead would index past
+  // n / VW vectors whenever the launch is padded beyond this count.
+  const int num_useful_threads = n / (VW * WPT);
+  if (get_global_id(0) < num_useful_threads) {
+    #pragma unroll
+    for (int w=0; w<WPT; ++w) {
+      const int id = w*num_useful_threads + get_global_id(0);
+      realV xvalue = xgm[id];
+      realV yvalue = ygm[id];
+      ygm[id] = MultiplyAddVector(yvalue, alpha, xvalue);
+    }
+  }
+}
+
// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
// divisible by 'VW', 'WGS' and 'WPT'.
__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
-void XaxpyFast(const int n, const real_arg arg_alpha,
- const __global realV* restrict xgm,
- __global realV* ygm) {
+void XaxpyFastest(const int n, const real_arg arg_alpha,
+ const __global realV* restrict xgm,
+ __global realV* ygm) {
const real alpha = GetRealArg(arg_alpha);
#pragma unroll