Improved GEMV kernel with local memory and a tunable WPT

author: CNugteren <web@cedricnugteren.nl> 2015-06-13 14:10:07 +0200
committer: CNugteren <web@cedricnugteren.nl> 2015-06-13 14:10:07 +0200
commit: 9b66883e9c016ed749e4e492416ac42b63a4ddd2 (patch)
tree: abd1fd196684160ec51f672e39051c263ab30b57 /src
parent: e522d1a74e6a877f32730da4807f54cf9a996679 (diff)
2 files changed, 82 insertions, 16 deletions
diff --git a/src/kernels/xgemv.opencl b/src/kernels/xgemv.opencl
index 1d2ab435..de7d5a80 100644
--- a/src/kernels/xgemv.opencl
+++ b/src/kernels/xgemv.opencl
@@ -39,24 +39,90 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta,
                     const __global real* restrict xgm, const int x_offset, const int x_inc,
                     __global real* ygm, const int y_offset, const int y_inc) {
 
-  // Loops over the work that needs to be done (allows for an arbitrary number of threads)
+  // Local memory for the vector X
+  __local real xlm[WGS];
+
+  // Initializes the accumulation register
+  real acc[WPT];
   #pragma unroll
-  for (int id = get_global_id(0); id<m; id += get_global_size(0)) {
-
-    // Loop over the elements of the matrix A
-    real acc;
-    SetToZero(acc);
-    if (a_transposed == 0) {
-      for (int k=0; k<n; ++k) {
-        MultiplyAdd(acc, agm[id + a_ld*k + a_offset], xgm[k*x_inc + x_offset]);
+  for (int w=0; w<WPT; ++w) {
+    SetToZero(acc[w]);
+  }
+
+  // Divides the work in a main and tail section
+  const int n_tail = n % WGS;
+  const int n_floor = n - n_tail;
+
+  // Loops over work-group sized portions of the work
+  for (int kwg=0; kwg<n_floor; kwg+=WGS) {
+
+    // Loads the vector X into local memory
+    const int lid = get_local_id(0);
+    xlm[lid] = xgm[(kwg + lid)*x_inc + x_offset];
+
+    // Synchronizes all threads in a workgroup
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Loops over the work per thread
+    #pragma unroll
+    for (int w=0; w<WPT; ++w) {
+      const int gid = w*get_global_size(0) + get_global_id(0);
+
+      // Checks whether this thread is within bounds
+      // Note: placed here because of the synchronisation barriers
+      if (gid < m) {
+
+        // Main multiply-add computation (regular)
+        if (a_transposed == 0) {
+          #pragma unroll
+          for (int kl=0; kl<WGS; ++kl) {
+            const int k = kwg + kl;
+            MultiplyAdd(acc[w], agm[gid + a_ld*k + a_offset], xlm[kl]);
+          }
+        }
+
+        // Main multiply-add computation (transposed)
+        else {
+          #pragma unroll
+          for (int kl=0; kl<WGS; ++kl) {
+            const int k = kwg + kl;
+            MultiplyAdd(acc[w], agm[k + a_ld*gid + a_offset], xlm[kl]);
+          }
+        }
       }
     }
-    else {
-      for (int k=0; k<n; ++k) {
-        MultiplyAdd(acc, agm[k + a_ld*id + a_offset], xgm[k*x_inc + x_offset]);
+
+    // Synchronizes all threads in a workgroup
+    barrier(CLK_LOCAL_MEM_FENCE);
+  }
+
+  // Loops over the work per thread
+  #pragma unroll
+  for (int w=0; w<WPT; ++w) {
+    const int gid = w*get_global_size(0) + get_global_id(0);
+
+    // Checks whether this thread is within bounds
+    if (gid < m) {
+
+      // Multiply-add computation for the remaining tail (regular)
+      if (a_transposed == 0) {
+        #pragma unroll
+        for (int k=n_floor; k<n; ++k) {
+          MultiplyAdd(acc[w], agm[gid + a_ld*k + a_offset], xgm[k*x_inc + x_offset]);
+        }
+      }
+
+      // Multiply-add computation for the remaining tail (transposed)
+      else {
+        #pragma unroll
+        for (int k=n_floor; k<n; ++k) {
+          MultiplyAdd(acc[w], agm[k + a_ld*gid + a_offset], xgm[k*x_inc + x_offset]);
+        }
       }
+
+      // Stores the final result
+      AXPBY(ygm[gid*y_inc + y_offset], alpha, acc[w], beta, ygm[gid*y_inc + y_offset]);
     }
-    AXPBY(ygm[id*y_inc + y_offset], alpha, acc, beta, ygm[id*y_inc + y_offset]);
   }
 }
 
diff --git a/src/tuning/xgemv.cc b/src/tuning/xgemv.cc
index 74bb77a5..6037a5a0 100644
--- a/src/tuning/xgemv.cc
+++ b/src/tuning/xgemv.cc
@@ -37,8 +37,8 @@ void XgemvTune(const Arguments<T> &args,
   tuner.SetReferenceFromString(sources, "Xgemv", {args.m}, {64});
 
   // Sets the tunable parameters and their possible values
-  tuner.AddParameter(id, "WGS", {64, 128});
-  tuner.AddParameter(id, "WPT", {1});
+  tuner.AddParameter(id, "WGS", {64, 128, 256, 512, 1024, 1536, 2048});
+  tuner.AddParameter(id, "WPT", {1, 2, 4});
   tuner.AddParameter(id, "VW", {1});
 
   // Tests for a specific precision
@@ -58,7 +58,7 @@ void XgemvTune(const Arguments<T> &args,
   tuner.AddArgumentScalar(0);
   tuner.AddArgumentInput(a_mat);
   tuner.AddArgumentScalar(0);
-  tuner.AddArgumentScalar(static_cast<int>(args.n));
+  tuner.AddArgumentScalar(static_cast<int>(args.m));
   tuner.AddArgumentInput(x_vec);
   tuner.AddArgumentScalar(0);
   tuner.AddArgumentScalar(1);
author	CNugteren <web@cedricnugteren.nl>	2015-06-13 14:10:07 +0200
committer	CNugteren <web@cedricnugteren.nl>	2015-06-13 14:10:07 +0200
commit	9b66883e9c016ed749e4e492416ac42b63a4ddd2 (patch)
tree	abd1fd196684160ec51f672e39051c263ab30b57 /src
parent	e522d1a74e6a877f32730da4807f54cf9a996679 (diff)