Implemented a first simple pre-processor: a parser for defines and loop unrolling based on assumptions

author: Cedric Nugteren <web@cedricnugteren.nl> 2017-11-25 17:46:01 +0100
committer: Cedric Nugteren <web@cedricnugteren.nl> 2017-11-25 17:46:01 +0100
commit: 69aa3b35ed499b5ba509d25ece97a24b66a456d9 (patch)
tree: 1cba91b8fcfb8a3da977420781d29e190729aa02 /src/kernels/level1
parent: f01bcded1e34e3b031e78cee357d1c1e0f1aa5be (diff)
1 files changed, 4 insertions, 6 deletions
diff --git a/src/kernels/level1/xaxpy.opencl b/src/kernels/level1/xaxpy.opencl
index d30d4e55..3a574ec2 100644
--- a/src/kernels/level1/xaxpy.opencl
+++ b/src/kernels/level1/xaxpy.opencl
@@ -29,8 +29,7 @@ void Xaxpy(const int n, const real_arg arg_alpha,
   const real alpha = GetRealArg(arg_alpha);
 
   // Loops over the work that needs to be done (allows for an arbitrary number of threads)
-  #pragma unroll
-  for (int id = get_global_id(0); id<n; id += get_global_size(0)) {
+  for (int id = get_global_id(0); id < n; id += get_global_size(0)) {
     real xvalue = xgm[id*x_inc + x_offset];
     MultiplyAdd(ygm[id*y_inc + y_offset], alpha, xvalue);
   }
@@ -46,7 +45,7 @@ void XaxpyFaster(const int n, const real_arg arg_alpha,
 
   if (get_global_id(0) < n / (VW)) {
     #pragma unroll
-    for (int w=0; w<WPT; ++w) {
+    for (int w = 0; w < WPT; w += 1) {
       const int id = w*get_global_size(0) + get_global_id(0);
       realV xvalue = xgm[id];
       realV yvalue = ygm[id];
@@ -64,7 +63,7 @@ void XaxpyFastest(const int n, const real_arg arg_alpha,
   const real alpha = GetRealArg(arg_alpha);
 
   #pragma unroll
-  for (int w=0; w<WPT; ++w) {
+  for (int w = 0; w < WPT; w += 1) {
     const int id = w*get_global_size(0) + get_global_id(0);
     realV xvalue = xgm[id];
     realV yvalue = ygm[id];
@@ -83,8 +82,7 @@ void XaxpyBatched(const int n, const __constant real_arg* arg_alphas,
   const real alpha = GetRealArg(arg_alphas[batch]);
 
   // Loops over the work that needs to be done (allows for an arbitrary number of threads)
-  #pragma unroll
-  for (int id = get_global_id(0); id<n; id += get_global_size(0)) {
+  for (int id = get_global_id(0); id < n; id += get_global_size(0)) {
     real xvalue = xgm[id*x_inc + x_offsets[batch]];
     MultiplyAdd(ygm[id*y_inc + y_offsets[batch]], alpha, xvalue);
   }
author	Cedric Nugteren <web@cedricnugteren.nl>	2017-11-25 17:46:01 +0100
committer	Cedric Nugteren <web@cedricnugteren.nl>	2017-11-25 17:46:01 +0100
commit	69aa3b35ed499b5ba509d25ece97a24b66a456d9 (patch)
tree	1cba91b8fcfb8a3da977420781d29e190729aa02 /src/kernels/level1
parent	f01bcded1e34e3b031e78cee357d1c1e0f1aa5be (diff)