summary | refs | log | tree | commit | diff
path: root/src/kernels/level1
diff options
context:
space:
mode:
author: Cedric Nugteren <web@cedricnugteren.nl> 2017-11-25 17:46:01 +0100
committer: Cedric Nugteren <web@cedricnugteren.nl> 2017-11-25 17:46:01 +0100
commit: 69aa3b35ed499b5ba509d25ece97a24b66a456d9 (patch)
tree: 1cba91b8fcfb8a3da977420781d29e190729aa02 /src/kernels/level1
parent: f01bcded1e34e3b031e78cee357d1c1e0f1aa5be (diff)
Implemented first simple pre-processor: defines parser and loop unrolling based on assumptions
Diffstat (limited to 'src/kernels/level1')
-rw-r--r-- src/kernels/level1/xaxpy.opencl 10
1 files changed, 4 insertions, 6 deletions
diff --git a/src/kernels/level1/xaxpy.opencl b/src/kernels/level1/xaxpy.opencl
index d30d4e55..3a574ec2 100644
--- a/src/kernels/level1/xaxpy.opencl
+++ b/src/kernels/level1/xaxpy.opencl
@@ -29,8 +29,7 @@ void Xaxpy(const int n, const real_arg arg_alpha,
const real alpha = GetRealArg(arg_alpha);
// Loops over the work that needs to be done (allows for an arbitrary number of threads)
- #pragma unroll
- for (int id = get_global_id(0); id<n; id += get_global_size(0)) {
+ for (int id = get_global_id(0); id < n; id += get_global_size(0)) {
real xvalue = xgm[id*x_inc + x_offset];
MultiplyAdd(ygm[id*y_inc + y_offset], alpha, xvalue);
}
@@ -46,7 +45,7 @@ void XaxpyFaster(const int n, const real_arg arg_alpha,
if (get_global_id(0) < n / (VW)) {
#pragma unroll
- for (int w=0; w<WPT; ++w) {
+ for (int w = 0; w < WPT; w += 1) {
const int id = w*get_global_size(0) + get_global_id(0);
realV xvalue = xgm[id];
realV yvalue = ygm[id];
@@ -64,7 +63,7 @@ void XaxpyFastest(const int n, const real_arg arg_alpha,
const real alpha = GetRealArg(arg_alpha);
#pragma unroll
- for (int w=0; w<WPT; ++w) {
+ for (int w = 0; w < WPT; w += 1) {
const int id = w*get_global_size(0) + get_global_id(0);
realV xvalue = xgm[id];
realV yvalue = ygm[id];
@@ -83,8 +82,7 @@ void XaxpyBatched(const int n, const __constant real_arg* arg_alphas,
const real alpha = GetRealArg(arg_alphas[batch]);
// Loops over the work that needs to be done (allows for an arbitrary number of threads)
- #pragma unroll
- for (int id = get_global_id(0); id<n; id += get_global_size(0)) {
+ for (int id = get_global_id(0); id < n; id += get_global_size(0)) {
real xvalue = xgm[id*x_inc + x_offsets[batch]];
MultiplyAdd(ygm[id*y_inc + y_offsets[batch]], alpha, xvalue);
}