summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorCNugteren <web@cedricnugteren.nl>2015-06-13 14:10:07 +0200
committerCNugteren <web@cedricnugteren.nl>2015-06-13 14:10:07 +0200
commit9b66883e9c016ed749e4e492416ac42b63a4ddd2 (patch)
treeabd1fd196684160ec51f672e39051c263ab30b57 /src
parente522d1a74e6a877f32730da4807f54cf9a996679 (diff)
Improved GEMV kernel with local memory and a tunable WPT
Diffstat (limited to 'src')
-rw-r--r--src/kernels/xgemv.opencl92
-rw-r--r--src/tuning/xgemv.cc6
2 files changed, 82 insertions, 16 deletions
diff --git a/src/kernels/xgemv.opencl b/src/kernels/xgemv.opencl
index 1d2ab435..de7d5a80 100644
--- a/src/kernels/xgemv.opencl
+++ b/src/kernels/xgemv.opencl
@@ -39,24 +39,90 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* ygm, const int y_offset, const int y_inc) {
- // Loops over the work that needs to be done (allows for an arbitrary number of threads)
+ // Local memory for the vector X
+ __local real xlm[WGS];
+
+ // Initializes the accumulation register
+ real acc[WPT];
#pragma unroll
- for (int id = get_global_id(0); id<m; id += get_global_size(0)) {
-
- // Loop over the elements of the matrix A
- real acc;
- SetToZero(acc);
- if (a_transposed == 0) {
- for (int k=0; k<n; ++k) {
- MultiplyAdd(acc, agm[id + a_ld*k + a_offset], xgm[k*x_inc + x_offset]);
+ for (int w=0; w<WPT; ++w) {
+ SetToZero(acc[w]);
+ }
+
+ // Divides the work in a main and tail section
+ const int n_tail = n % WGS;
+ const int n_floor = n - n_tail;
+
+ // Loops over work-group sized portions of the work
+ for (int kwg=0; kwg<n_floor; kwg+=WGS) {
+
+ // Loads the vector X into local memory
+ const int lid = get_local_id(0);
+ xlm[lid] = xgm[(kwg + lid)*x_inc + x_offset];
+
+ // Synchronizes all threads in a workgroup
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // Loops over the work per thread
+ #pragma unroll
+ for (int w=0; w<WPT; ++w) {
+ const int gid = w*get_global_size(0) + get_global_id(0);
+
+ // Checks whether this thread is within bounds
+ // Note: placed here because of the synchronisation barriers
+ if (gid < m) {
+
+ // Main multiply-add computation (regular)
+ if (a_transposed == 0) {
+ #pragma unroll
+ for (int kl=0; kl<WGS; ++kl) {
+ const int k = kwg + kl;
+ MultiplyAdd(acc[w], agm[gid + a_ld*k + a_offset], xlm[kl]);
+ }
+ }
+
+ // Main multiply-add computation (transposed)
+ else {
+ #pragma unroll
+ for (int kl=0; kl<WGS; ++kl) {
+ const int k = kwg + kl;
+ MultiplyAdd(acc[w], agm[k + a_ld*gid + a_offset], xlm[kl]);
+ }
+ }
}
}
- else {
- for (int k=0; k<n; ++k) {
- MultiplyAdd(acc, agm[k + a_ld*id + a_offset], xgm[k*x_inc + x_offset]);
+
+ // Synchronizes all threads in a workgroup
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+
+ // Loops over the work per thread
+ #pragma unroll
+ for (int w=0; w<WPT; ++w) {
+ const int gid = w*get_global_size(0) + get_global_id(0);
+
+ // Checks whether this thread is within bounds
+ if (gid < m) {
+
+ // Multiply-add computation for the remaining tail (regular)
+ if (a_transposed == 0) {
+ #pragma unroll
+ for (int k=n_floor; k<n; ++k) {
+ MultiplyAdd(acc[w], agm[gid + a_ld*k + a_offset], xgm[k*x_inc + x_offset]);
+ }
+ }
+
+ // Multiply-add computation for the remaining tail (transposed)
+ else {
+ #pragma unroll
+ for (int k=n_floor; k<n; ++k) {
+ MultiplyAdd(acc[w], agm[k + a_ld*gid + a_offset], xgm[k*x_inc + x_offset]);
+ }
}
+
+ // Stores the final result
+ AXPBY(ygm[gid*y_inc + y_offset], alpha, acc[w], beta, ygm[gid*y_inc + y_offset]);
}
- AXPBY(ygm[id*y_inc + y_offset], alpha, acc, beta, ygm[id*y_inc + y_offset]);
}
}
diff --git a/src/tuning/xgemv.cc b/src/tuning/xgemv.cc
index 74bb77a5..6037a5a0 100644
--- a/src/tuning/xgemv.cc
+++ b/src/tuning/xgemv.cc
@@ -37,8 +37,8 @@ void XgemvTune(const Arguments<T> &args,
tuner.SetReferenceFromString(sources, "Xgemv", {args.m}, {64});
// Sets the tunable parameters and their possible values
- tuner.AddParameter(id, "WGS", {64, 128});
- tuner.AddParameter(id, "WPT", {1});
+ tuner.AddParameter(id, "WGS", {64, 128, 256, 512, 1024, 1536, 2048});
+ tuner.AddParameter(id, "WPT", {1, 2, 4});
tuner.AddParameter(id, "VW", {1});
// Tests for a specific precision
@@ -58,7 +58,7 @@ void XgemvTune(const Arguments<T> &args,
tuner.AddArgumentScalar(0);
tuner.AddArgumentInput(a_mat);
tuner.AddArgumentScalar(0);
- tuner.AddArgumentScalar(static_cast<int>(args.n));
+ tuner.AddArgumentScalar(static_cast<int>(args.m));
tuner.AddArgumentInput(x_vec);
tuner.AddArgumentScalar(0);
tuner.AddArgumentScalar(1);