diff options
author | CNugteren <web@cedricnugteren.nl> | 2015-06-13 14:10:07 +0200 |
---|---|---|
committer | CNugteren <web@cedricnugteren.nl> | 2015-06-13 14:10:07 +0200 |
commit | 9b66883e9c016ed749e4e492416ac42b63a4ddd2 (patch) | |
tree | abd1fd196684160ec51f672e39051c263ab30b57 /src | |
parent | e522d1a74e6a877f32730da4807f54cf9a996679 (diff) |
Improved GEMV kernel with local memory and a tunable WPT
Diffstat (limited to 'src')
-rw-r--r-- | src/kernels/xgemv.opencl | 92 | ||||
-rw-r--r-- | src/tuning/xgemv.cc | 6 |
2 files changed, 82 insertions, 16 deletions
diff --git a/src/kernels/xgemv.opencl b/src/kernels/xgemv.opencl index 1d2ab435..de7d5a80 100644 --- a/src/kernels/xgemv.opencl +++ b/src/kernels/xgemv.opencl @@ -39,24 +39,90 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* ygm, const int y_offset, const int y_inc) { - // Loops over the work that needs to be done (allows for an arbitrary number of threads) + // Local memory for the vector X + __local real xlm[WGS]; + + // Initializes the accumulation register + real acc[WPT]; #pragma unroll - for (int id = get_global_id(0); id<m; id += get_global_size(0)) { - - // Loop over the elements of the matrix A - real acc; - SetToZero(acc); - if (a_transposed == 0) { - for (int k=0; k<n; ++k) { - MultiplyAdd(acc, agm[id + a_ld*k + a_offset], xgm[k*x_inc + x_offset]); + for (int w=0; w<WPT; ++w) { + SetToZero(acc[w]); + } + + // Divides the work in a main and tail section + const int n_tail = n % WGS; + const int n_floor = n - n_tail; + + // Loops over work-group sized portions of the work + for (int kwg=0; kwg<n_floor; kwg+=WGS) { + + // Loads the vector X into local memory + const int lid = get_local_id(0); + xlm[lid] = xgm[(kwg + lid)*x_inc + x_offset]; + + // Synchronizes all threads in a workgroup + barrier(CLK_LOCAL_MEM_FENCE); + + // Loops over the work per thread + #pragma unroll + for (int w=0; w<WPT; ++w) { + const int gid = w*get_global_size(0) + get_global_id(0); + + // Checks whether this thread is within bounds + // Note: placed here because of the synchronisation barriers + if (gid < m) { + + // Main multiply-add computation (regular) + if (a_transposed == 0) { + #pragma unroll + for (int kl=0; kl<WGS; ++kl) { + const int k = kwg + kl; + MultiplyAdd(acc[w], agm[gid + a_ld*k + a_offset], xlm[kl]); + } + } + + // Main multiply-add computation (transposed) + else { + #pragma unroll + for (int kl=0; kl<WGS; ++kl) { + const int k = kwg + kl; + MultiplyAdd(acc[w], agm[k + a_ld*gid + a_offset], xlm[kl]); + } + } } } - else { - for (int k=0; k<n; ++k) { - MultiplyAdd(acc, agm[k + a_ld*id + a_offset], xgm[k*x_inc + x_offset]); + + // Synchronizes all threads in a workgroup + barrier(CLK_LOCAL_MEM_FENCE); + } + + // Loops over the work per thread + #pragma unroll + for (int w=0; w<WPT; ++w) { + const int gid = w*get_global_size(0) + get_global_id(0); + + // Checks whether this thread is within bounds + if (gid < m) { + + // Multiply-add computation for the remaining tail (regular) + if (a_transposed == 0) { + #pragma unroll + for (int k=n_floor; k<n; ++k) { + MultiplyAdd(acc[w], agm[gid + a_ld*k + a_offset], xgm[k*x_inc + x_offset]); + } + } + + // Multiply-add computation for the remaining tail (transposed) + else { + #pragma unroll + for (int k=n_floor; k<n; ++k) { + MultiplyAdd(acc[w], agm[k + a_ld*gid + a_offset], xgm[k*x_inc + x_offset]); + } } + + // Stores the final result + AXPBY(ygm[gid*y_inc + y_offset], alpha, acc[w], beta, ygm[gid*y_inc + y_offset]); } - AXPBY(ygm[id*y_inc + y_offset], alpha, acc, beta, ygm[id*y_inc + y_offset]); } } diff --git a/src/tuning/xgemv.cc b/src/tuning/xgemv.cc index 74bb77a5..6037a5a0 100644 --- a/src/tuning/xgemv.cc +++ b/src/tuning/xgemv.cc @@ -37,8 +37,8 @@ void XgemvTune(const Arguments<T> &args, tuner.SetReferenceFromString(sources, "Xgemv", {args.m}, {64}); // Sets the tunable parameters and their possible values - tuner.AddParameter(id, "WGS", {64, 128}); - tuner.AddParameter(id, "WPT", {1}); + tuner.AddParameter(id, "WGS", {64, 128, 256, 512, 1024, 1536, 2048}); + tuner.AddParameter(id, "WPT", {1, 2, 4}); tuner.AddParameter(id, "VW", {1}); // Tests for a specific precision @@ -58,7 +58,7 @@ void XgemvTune(const Arguments<T> &args, tuner.AddArgumentScalar(0); tuner.AddArgumentInput(a_mat); tuner.AddArgumentScalar(0); - tuner.AddArgumentScalar(static_cast<int>(args.n)); + tuner.AddArgumentScalar(static_cast<int>(args.m)); tuner.AddArgumentInput(x_vec); tuner.AddArgumentScalar(0); tuner.AddArgumentScalar(1); |