diff options
-rw-r--r-- | CHANGELOG | 1 | ||||
-rw-r--r-- | src/kernels/level2/xgemv_fast.opencl | 10 | ||||
-rw-r--r-- | src/tuning/kernels/xgemv.cpp | 18 |
3 files changed, 21 insertions, 8 deletions
@@ -7,6 +7,7 @@ Development version (next release) - Fixed a bug related to the cache and retrieval of programs based on the OpenCL context - Fixed a performance issue (caused by fp16 support) by optimizing alpha/beta parameter passing to kernels - Added an option (-warm_up) to do a warm-up run before timing in the performance clients +- Improved performance significantly of rotated GEMV computations - Added tuned parameters for various devices (see README) Version 0.8.0 diff --git a/src/kernels/level2/xgemv_fast.opencl b/src/kernels/level2/xgemv_fast.opencl index 359c3770..210c42c1 100644 --- a/src/kernels/level2/xgemv_fast.opencl +++ b/src/kernels/level2/xgemv_fast.opencl @@ -97,7 +97,7 @@ __kernel void XgemvFast(const int m, const int n, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* ygm, const int y_offset, const int y_inc, const int do_conjugate, const int parameter, - const int kl, const int ku) { + const int kl_unused, const int ku_unused) { const real alpha = GetRealArg(arg_alpha); const real beta = GetRealArg(arg_beta); @@ -199,7 +199,7 @@ __kernel void XgemvFastRot(const int m, const int n, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* ygm, const int y_offset, const int y_inc, const int do_conjugate, const int parameter, - const int kl, const int ku) { + const int kl_unused, const int ku_unused) { const real alpha = GetRealArg(arg_alpha); const real beta = GetRealArg(arg_beta); @@ -216,11 +216,13 @@ __kernel void XgemvFastRot(const int m, const int n, real acc; SetToZero(acc); - // Loops over work-group sized portions of the work + // Loops over tile-sized portions of the work for (int kwg=0; kwg<n; kwg+=WPT3) { // Loads the vector X into local memory - xlm[lid] = xgm[(kwg + lid) * x_inc + x_offset]; + if (lid < WPT3) { + xlm[lid] = xgm[(kwg + lid) * x_inc + x_offset]; + } // Loads the matrix A into local memory #pragma unroll diff --git a/src/tuning/kernels/xgemv.cpp b/src/tuning/kernels/xgemv.cpp index 8446e4a9..96d4a5f2 100644 --- a/src/tuning/kernels/xgemv.cpp +++ b/src/tuning/kernels/xgemv.cpp @@ -61,10 +61,20 @@ class TuneXgemv { // Sets the tuning parameters and their possible values static void SetParameters(cltune::Tuner &tuner, const size_t id) { - tuner.AddParameter(id, "WGS"+std::to_string(V), {16, 32, 64, 128}); - if (V==1 || V==2) { tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4}); } - else { tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4, 8, 16, 32}); } - if (V==2 || V==3) { tuner.AddParameter(id, "VW"+std::to_string(V), {1, 2, 4, 8}); } + if (V==1) { + tuner.AddParameter(id, "WGS"+std::to_string(V), {32, 64, 128, 256}); + tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4}); + } + if (V==2) { + tuner.AddParameter(id, "WGS"+std::to_string(V), {16, 32, 64, 128, 256}); + tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4}); + tuner.AddParameter(id, "VW"+std::to_string(V), {1, 2, 4, 8}); + } + if (V==3) { + tuner.AddParameter(id, "WGS"+std::to_string(V), {16, 32, 64, 128}); + tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4, 8, 16, 32}); + tuner.AddParameter(id, "VW"+std::to_string(V), {1, 2, 4, 8}); + } } // Sets the constraints and local memory size |