diff options
author | CNugteren <web@cedricnugteren.nl> | 2016-02-06 12:09:21 +0100 |
---|---|---|
committer | CNugteren <web@cedricnugteren.nl> | 2016-02-06 12:09:21 +0100 |
commit | 40346bb3a551f14afa5465d7708d8d31102e475e (patch) | |
tree | 3a2e1f336d2cbeef4c122eb3a7cf6e44e52e4545 /src | |
parent | fbf071ba6299e053f4cf4011168d80bf877f3a07 (diff) |
Reduced unrolling factor in xgemv kernel to reduce compilation times
Diffstat (limited to 'src')
-rw-r--r-- | src/kernels/level2/xgemv.opencl | 33 | ||||
-rw-r--r-- | src/tuning/xgemv.cc | 4 |
2 files changed, 22 insertions, 15 deletions
```diff
diff --git a/src/kernels/level2/xgemv.opencl b/src/kernels/level2/xgemv.opencl
index 8ed0e9e4..908d7d13 100644
--- a/src/kernels/level2/xgemv.opencl
+++ b/src/kernels/level2/xgemv.opencl
@@ -27,6 +27,9 @@ R"(
 #ifndef WPT1
   #define WPT1 1 // The amount of work-per-thread
 #endif
+#ifndef UNROLL1
+  #define UNROLL1 32 // Unroll factor (must be a divider of WGS1)
+#endif
 
 // 2: For the fast version
 #ifndef WGS2
@@ -301,28 +304,31 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta,
     barrier(CLK_LOCAL_MEM_FENCE);
 
     // Loops over the work per thread, and checks whether in bounds
-    #pragma unroll
     for (int w=0; w<WPT1; ++w) {
       const int gid = w*get_global_size(0) + get_global_id(0);
       if (gid < m) {
         // The multiply-add function for the main part (divisable by WGS1)
         if (a_rotated == 0) { // Not rotated
-          #pragma unroll
-          for (int kloop=0; kloop<WGS1; ++kloop) {
-            const int k = kwg + kloop;
-            real value = LoadMatrixA(agm, gid, k, a_ld, a_offset, parameter, kl, ku);
-            if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
-            MultiplyAdd(acc[w], xlm[kloop], value);
+          for (int kloop=0; kloop<WGS1; kloop+=UNROLL1) {
+            #pragma unroll
+            for (int kunroll=0; kunroll<UNROLL1; ++kunroll) {
+              const int k = kwg + kloop + kunroll;
+              real value = LoadMatrixA(agm, gid, k, a_ld, a_offset, parameter, kl, ku);
+              if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
+              MultiplyAdd(acc[w], xlm[kloop + kunroll], value);
+            }
           }
         }
         else { // Transposed
-          #pragma unroll
-          for (int kloop=0; kloop<WGS1; ++kloop) {
-            const int k = kwg + kloop;
-            real value = LoadMatrixA(agm, k, gid, a_ld, a_offset, parameter, kl, ku);
-            if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
-            MultiplyAdd(acc[w], xlm[kloop], value);
+          for (int kloop=0; kloop<WGS1; kloop+=UNROLL1) {
+            #pragma unroll
+            for (int kunroll=0; kunroll<UNROLL1; ++kunroll) {
+              const int k = kwg + kloop + kunroll;
+              real value = LoadMatrixA(agm, k, gid, a_ld, a_offset, parameter, kl, ku);
+              if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
+              MultiplyAdd(acc[w], xlm[kloop + kunroll], value);
+            }
           }
         }
       }
     }
@@ -563,3 +569,4 @@ __kernel void XgemvFastRot(const int m, const int n, const real alpha, const rea
 )"
 
 // =================================================================================================
+
diff --git a/src/tuning/xgemv.cc b/src/tuning/xgemv.cc
index 60c73d76..c3cf9b7f 100644
--- a/src/tuning/xgemv.cc
+++ b/src/tuning/xgemv.cc
@@ -60,8 +60,8 @@ class TuneXgemv {
 
   // Sets the tuning parameters and their possible values
   static void SetParameters(cltune::Tuner &tuner, const size_t id) {
-    tuner.AddParameter(id, "WGS"+std::to_string(V), {64, 128, 256, 512, 1024, 1536, 2048});
-    tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4, 8});
+    tuner.AddParameter(id, "WGS"+std::to_string(V), {64, 128, 256, 512});
+    tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4});
     if (V==2 || V==3) { tuner.AddParameter(id, "VW"+std::to_string(V), {1, 2, 4, 8}); }
   }
 
```