Fixed a bug in the new XgemvFastRot kernel related to local memory size

author: Cedric Nugteren <web@cedricnugteren.nl> 2016-07-23 16:58:11 +0200
committer: Cedric Nugteren <web@cedricnugteren.nl> 2016-07-23 16:58:11 +0200
commit: 40a72259eba491631d8875aae465c5a93d7fed02 (patch)
tree: a192d68c4c7331334721da58401a88a21fa9a88b
parent: 7a4f9637639ce83191bc2d6e8485f9a9dfd949af (diff)
3 files changed, 21 insertions, 8 deletions
diff --git a/CHANGELOG b/CHANGELOG
index b6e09102..d018e211 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -7,6 +7,7 @@ Development version (next release)
 - Fixed a bug related to the cache and retrieval of programs based on the OpenCL context
 - Fixed a performance issue (caused by fp16 support) by optimizing alpha/beta parameter passing to kernels
 - Added an option (-warm_up) to do a warm-up run before timing in the performance clients
+- Improved performance significantly of rotated GEMV computations
 - Added tuned parameters for various devices (see README)
 
 Version 0.8.0
diff --git a/src/kernels/level2/xgemv_fast.opencl b/src/kernels/level2/xgemv_fast.opencl
index 359c3770..210c42c1 100644
--- a/src/kernels/level2/xgemv_fast.opencl
+++ b/src/kernels/level2/xgemv_fast.opencl
@@ -97,7 +97,7 @@ __kernel void XgemvFast(const int m, const int n,
                         const __global real* restrict xgm, const int x_offset, const int x_inc,
                         __global real* ygm, const int y_offset, const int y_inc,
                         const int do_conjugate, const int parameter,
-                        const int kl, const int ku) {
+                        const int kl_unused, const int ku_unused) {
   const real alpha = GetRealArg(arg_alpha);
   const real beta = GetRealArg(arg_beta);
 
@@ -199,7 +199,7 @@ __kernel void XgemvFastRot(const int m, const int n,
                            const __global real* restrict xgm, const int x_offset, const int x_inc,
                            __global real* ygm, const int y_offset, const int y_inc,
                            const int do_conjugate, const int parameter,
-                           const int kl, const int ku) {
+                           const int kl_unused, const int ku_unused) {
   const real alpha = GetRealArg(arg_alpha);
   const real beta = GetRealArg(arg_beta);
 
@@ -216,11 +216,13 @@ __kernel void XgemvFastRot(const int m, const int n,
   real acc;
   SetToZero(acc);
 
-  // Loops over work-group sized portions of the work
+  // Loops over tile-sized portions of the work
   for (int kwg=0; kwg<n; kwg+=WPT3) {
 
     // Loads the vector X into local memory
-    xlm[lid] = xgm[(kwg + lid) * x_inc + x_offset];
+    if (lid < WPT3) {
+      xlm[lid] = xgm[(kwg + lid) * x_inc + x_offset];
+    }
 
     // Loads the matrix A into local memory
     #pragma unroll
diff --git a/src/tuning/kernels/xgemv.cpp b/src/tuning/kernels/xgemv.cpp
index 8446e4a9..96d4a5f2 100644
--- a/src/tuning/kernels/xgemv.cpp
+++ b/src/tuning/kernels/xgemv.cpp
@@ -61,10 +61,20 @@ class TuneXgemv {
 
   // Sets the tuning parameters and their possible values
   static void SetParameters(cltune::Tuner &tuner, const size_t id) {
-    tuner.AddParameter(id, "WGS"+std::to_string(V), {16, 32, 64, 128});
-    if (V==1 || V==2) { tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4}); }
-    else { tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4, 8, 16, 32}); }
-    if (V==2 || V==3) { tuner.AddParameter(id, "VW"+std::to_string(V), {1, 2, 4, 8}); }
+    if (V==1) {
+      tuner.AddParameter(id, "WGS"+std::to_string(V), {32, 64, 128, 256});
+      tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4});
+    }
+    if (V==2) {
+      tuner.AddParameter(id, "WGS"+std::to_string(V), {16, 32, 64, 128, 256});
+      tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4});
+      tuner.AddParameter(id, "VW"+std::to_string(V), {1, 2, 4, 8});
+    }
+    if (V==3) {
+      tuner.AddParameter(id, "WGS"+std::to_string(V), {16, 32, 64, 128});
+      tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4, 8, 16, 32});
+      tuner.AddParameter(id, "VW"+std::to_string(V), {1, 2, 4, 8});
+    }
   }
 
   // Sets the constraints and local memory size
author	Cedric Nugteren <web@cedricnugteren.nl>	2016-07-23 16:58:11 +0200
committer	Cedric Nugteren <web@cedricnugteren.nl>	2016-07-23 16:58:11 +0200
commit	40a72259eba491631d8875aae465c5a93d7fed02 (patch)
tree	a192d68c4c7331334721da58401a88a21fa9a88b
parent	7a4f9637639ce83191bc2d6e8485f9a9dfd949af (diff)