Completed kernel modifications for pre-processor of all other kernels

author: Cedric Nugteren <web@cedricnugteren.nl> 2017-12-09 20:44:21 +0100
committer: Cedric Nugteren <web@cedricnugteren.nl> 2017-12-09 20:44:21 +0100
commit: 9f02fb542ca659bf58d1efefdb334ea386ef10e8 (patch)
tree: 8669e215f38e8b00a4ee73a436479207f2afdbf5 /src/kernels/level1
parent: ca5dbcd2bd31fb0a0e3f6c2f81b3c0fff6250738 (diff)
6 files changed, 6 insertions, 15 deletions
diff --git a/src/kernels/level1/xamax.opencl b/src/kernels/level1/xamax.opencl
index 2bd2f714..27add015 100644
--- a/src/kernels/level1/xamax.opencl
+++ b/src/kernels/level1/xamax.opencl
@@ -75,7 +75,6 @@ void Xamax(const int n,
   barrier(CLK_LOCAL_MEM_FENCE);
 
   // Performs reduction in local memory
-  #pragma unroll
   for (int s=WGS1/2; s>0; s=s>>1) {
     if (lid < s) {
       if (maxlm[lid + s] >= maxlm[lid]) {
@@ -117,7 +116,6 @@ void XamaxEpilogue(const __global singlereal* restrict maxgm,
   barrier(CLK_LOCAL_MEM_FENCE);
 
   // Performs reduction in local memory
-  #pragma unroll
   for (int s=WGS2/2; s>0; s=s>>1) {
     if (lid < s) {
       if (maxlm[lid + s] >= maxlm[lid]) {
diff --git a/src/kernels/level1/xasum.opencl b/src/kernels/level1/xasum.opencl
index 1fc91be8..29e7fa3e 100644
--- a/src/kernels/level1/xasum.opencl
+++ b/src/kernels/level1/xasum.opencl
@@ -56,7 +56,6 @@ void Xasum(const int n,
   barrier(CLK_LOCAL_MEM_FENCE);
 
   // Performs reduction in local memory
-  #pragma unroll
   for (int s=WGS1/2; s>0; s=s>>1) {
     if (lid < s) {
       Add(lm[lid], lm[lid], lm[lid + s]);
@@ -85,7 +84,6 @@ void XasumEpilogue(const __global real* restrict input,
   barrier(CLK_LOCAL_MEM_FENCE);
 
   // Performs reduction in local memory
-  #pragma unroll
   for (int s=WGS2/2; s>0; s=s>>1) {
     if (lid < s) {
       Add(lm[lid], lm[lid], lm[lid + s]);
diff --git a/src/kernels/level1/xcopy.opencl b/src/kernels/level1/xcopy.opencl
index 228e0735..aed80fc2 100644
--- a/src/kernels/level1/xcopy.opencl
+++ b/src/kernels/level1/xcopy.opencl
@@ -28,7 +28,6 @@ void Xcopy(const int n,
            __global real* ygm, const int y_offset, const int y_inc) {
 
   // Loops over the work that needs to be done (allows for an arbitrary number of threads)
-  #pragma unroll
   for (int id = get_global_id(0); id<n; id += get_global_size(0)) {
     ygm[id*y_inc + y_offset] = xgm[id*x_inc + x_offset];
   }
@@ -43,8 +42,8 @@ void XcopyFast(const int n,
                const __global realV* restrict xgm,
                __global realV* ygm) {
   #pragma unroll
-  for (int w=0; w<WPT; ++w) {
-    const int id = w*get_global_size(0) + get_global_id(0);
+  for (int _w = 0; _w < WPT; _w += 1) {
+    const int id = _w*get_global_size(0) + get_global_id(0);
     ygm[id] = xgm[id];
   }
 }
diff --git a/src/kernels/level1/xnrm2.opencl b/src/kernels/level1/xnrm2.opencl
index f6d869cb..6a81c150 100644
--- a/src/kernels/level1/xnrm2.opencl
+++ b/src/kernels/level1/xnrm2.opencl
@@ -54,7 +54,6 @@ void Xnrm2(const int n,
   barrier(CLK_LOCAL_MEM_FENCE);
 
   // Performs reduction in local memory
-  #pragma unroll
   for (int s=WGS1/2; s>0; s=s>>1) {
     if (lid < s) {
       Add(lm[lid], lm[lid], lm[lid + s]);
@@ -83,7 +82,6 @@ void Xnrm2Epilogue(const __global real* restrict input,
   barrier(CLK_LOCAL_MEM_FENCE);
 
   // Performs reduction in local memory
-  #pragma unroll
   for (int s=WGS2/2; s>0; s=s>>1) {
     if (lid < s) {
       Add(lm[lid], lm[lid], lm[lid + s]);
diff --git a/src/kernels/level1/xscal.opencl b/src/kernels/level1/xscal.opencl
index 3da9c2fd..cb133e88 100644
--- a/src/kernels/level1/xscal.opencl
+++ b/src/kernels/level1/xscal.opencl
@@ -28,7 +28,6 @@ void Xscal(const int n, const real_arg arg_alpha,
   const real alpha = GetRealArg(arg_alpha);
 
   // Loops over the work that needs to be done (allows for an arbitrary number of threads)
-  #pragma unroll
   for (int id = get_global_id(0); id<n; id += get_global_size(0)) {
     real xvalue = xgm[id*x_inc + x_offset];
     real result;
@@ -47,8 +46,8 @@ void XscalFast(const int n, const real_arg arg_alpha,
   const real alpha = GetRealArg(arg_alpha);
 
   #pragma unroll
-  for (int w=0; w<WPT; ++w) {
-    const int id = w*get_global_size(0) + get_global_id(0);
+  for (int _w = 0; _w < WPT; _w += 1) {
+    const int id = _w*get_global_size(0) + get_global_id(0);
     realV xvalue = xgm[id];
     realV result;
     result = MultiplyVector(result, alpha, xvalue);
diff --git a/src/kernels/level1/xswap.opencl b/src/kernels/level1/xswap.opencl
index 267271c0..bf5b6194 100644
--- a/src/kernels/level1/xswap.opencl
+++ b/src/kernels/level1/xswap.opencl
@@ -28,7 +28,6 @@ void Xswap(const int n,
            __global real* ygm, const int y_offset, const int y_inc) {
 
   // Loops over the work that needs to be done (allows for an arbitrary number of threads)
-  #pragma unroll
   for (int id = get_global_id(0); id<n; id += get_global_size(0)) {
     real temp = xgm[id*x_inc + x_offset];
     xgm[id*x_inc + x_offset] = ygm[id*y_inc + y_offset];
@@ -45,8 +44,8 @@ void XswapFast(const int n,
                __global realV* xgm,
                __global realV* ygm) {
   #pragma unroll
-  for (int w=0; w<WPT; ++w) {
-    const int id = w*get_global_size(0) + get_global_id(0);
+  for (int _w = 0; _w < WPT; _w += 1) {
+    const int id = _w*get_global_size(0) + get_global_id(0);
     realV temp = xgm[id];
     xgm[id] = ygm[id];
     ygm[id] = temp;
author	Cedric Nugteren <web@cedricnugteren.nl>	2017-12-09 20:44:21 +0100
committer	Cedric Nugteren <web@cedricnugteren.nl>	2017-12-09 20:44:21 +0100
commit	9f02fb542ca659bf58d1efefdb334ea386ef10e8 (patch)
tree	8669e215f38e8b00a4ee73a436479207f2afdbf5 /src/kernels/level1
parent	ca5dbcd2bd31fb0a0e3f6c2f81b3c0fff6250738 (diff)