diff options
-rw-r--r-- | src/database/kernels/xgemv.hpp | 2 | ||||
-rw-r--r-- | src/kernels/level2/xgemv_fast.opencl | 84 | ||||
-rw-r--r-- | src/tuning/kernels/xgemv.cpp | 8 |
3 files changed, 50 insertions, 44 deletions
diff --git a/src/database/kernels/xgemv.hpp b/src/database/kernels/xgemv.hpp index 6fb68858..3aa1863f 100644 --- a/src/database/kernels/xgemv.hpp +++ b/src/database/kernels/xgemv.hpp @@ -36,7 +36,7 @@ const Database::DatabaseEntry Database::XgemvSingle = { "Xgemv", Precision::kSingle, { { // AMD GPUs kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",4}, {"WGS3",32}, {"WPT3",32} } }, + { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",8}, {"WGS3",16}, {"WPT3",16} } }, { "Hawaii", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, { "Oland", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",4}, {"WGS3",256}, {"WPT3",4} } }, { "Pitcairn", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, diff --git a/src/kernels/level2/xgemv_fast.opencl b/src/kernels/level2/xgemv_fast.opencl index 1d34de96..359c3770 100644 --- a/src/kernels/level2/xgemv_fast.opencl +++ b/src/kernels/level2/xgemv_fast.opencl @@ -204,10 +204,10 @@ __kernel void XgemvFastRot(const int m, const int n, const real beta = GetRealArg(arg_beta); // Local memory to store a tile of the matrix (for coalescing) - __local real tile[WGS3 * WPT3]; + __local real tile[WPT3][WGS3]; const int lid = get_local_id(0); - const int lid_mod = lid % WPT3; - const int lid_div = lid / WPT3; + const int lid_mod = lid % (WPT3/VW3); + const int lid_div = lid / (WPT3/VW3); // Local memory for the vector X __local real xlm[WPT3]; @@ -225,45 +225,45 @@ __kernel void XgemvFastRot(const int m, const int n, // Loads the matrix A into local memory #pragma unroll for (int kl=0; kl<WPT3/VW3; ++kl) { - const int x = (kwg/VW3) + kl; - const int y = get_group_id(0) * WGS3 + lid; + const int x = (kwg/VW3) + lid_mod; + const int y = get_group_id(0) * WGS3 + lid_div * (WPT3/VW3) + kl; realVFR avec = agm[(a_ld/VW3) * y + x]; #if VW3 == 1 - tile[(kl*VW3 + 0) * WGS3 + lid] = avec; + tile[kl*VW3 + 0][lid] = avec; #elif VW3 == 2 - tile[(kl*VW3 + 0) * WGS3 + lid] = avec.x; - tile[(kl*VW3 + 1) * WGS3 + lid] = avec.y; + tile[kl*VW3 + 0][lid] = avec.x; + tile[kl*VW3 + 1][lid] = avec.y; #elif VW3 == 4 - tile[(kl*VW3 + 0) * WGS3 + lid] = avec.x; - tile[(kl*VW3 + 1) * WGS3 + lid] = avec.y; - tile[(kl*VW3 + 2) * WGS3 + lid] = avec.z; - tile[(kl*VW3 + 3) * WGS3 + lid] = avec.w; + tile[kl*VW3 + 0][lid] = avec.x; + tile[kl*VW3 + 1][lid] = avec.y; + tile[kl*VW3 + 2][lid] = avec.z; + tile[kl*VW3 + 3][lid] = avec.w; #elif VW3 == 8 - tile[(kl*VW3 + 0) * WGS3 + lid] = avec.s0; - tile[(kl*VW3 + 1) * WGS3 + lid] = avec.s1; - tile[(kl*VW3 + 2) * WGS3 + lid] = avec.s2; - tile[(kl*VW3 + 3) * WGS3 + lid] = avec.s3; - tile[(kl*VW3 + 4) * WGS3 + lid] = avec.s4; - tile[(kl*VW3 + 5) * WGS3 + lid] = avec.s5; - tile[(kl*VW3 + 6) * WGS3 + lid] = avec.s6; - tile[(kl*VW3 + 7) * WGS3 + lid] = avec.s7; + tile[kl*VW3 + 0][lid] = avec.s0; + tile[kl*VW3 + 1][lid] = avec.s1; + tile[kl*VW3 + 2][lid] = avec.s2; + tile[kl*VW3 + 3][lid] = avec.s3; + tile[kl*VW3 + 4][lid] = avec.s4; + tile[kl*VW3 + 5][lid] = avec.s5; + tile[kl*VW3 + 6][lid] = avec.s6; + tile[kl*VW3 + 7][lid] = avec.s7; #elif VW3 == 16 - tile[(kl*VW3 + 0) * WGS3 + lid] = avec.s0; - tile[(kl*VW3 + 1) * WGS3 + lid] = avec.s1; - tile[(kl*VW3 + 2) * WGS3 + lid] = avec.s2; - tile[(kl*VW3 + 3) * WGS3 + lid] = avec.s3; - tile[(kl*VW3 + 4) * WGS3 + lid] = avec.s4; - tile[(kl*VW3 + 5) * WGS3 + lid] = avec.s5; - tile[(kl*VW3 + 6) * WGS3 + lid] = avec.s6; - tile[(kl*VW3 + 7) * WGS3 + lid] = avec.s7; - tile[(kl*VW3 + 8) * WGS3 + lid] = avec.s8; - tile[(kl*VW3 + 9) * WGS3 + lid] = avec.s9; - tile[(kl*VW3 + 10) * WGS3 + lid] = avec.sA; - tile[(kl*VW3 + 11) * WGS3 + lid] = avec.sB; - tile[(kl*VW3 + 12) * WGS3 + lid] = avec.sC; - tile[(kl*VW3 + 13) * WGS3 + lid] = avec.sD; - tile[(kl*VW3 + 14) * WGS3 + lid] = avec.sE; - tile[(kl*VW3 + 15) * WGS3 + lid] = avec.sF; + tile[kl*VW3 + 0][lid] = avec.s0; + tile[kl*VW3 + 1][lid] = avec.s1; + tile[kl*VW3 + 2][lid] = avec.s2; + tile[kl*VW3 + 3][lid] = avec.s3; + tile[kl*VW3 + 4][lid] = avec.s4; + tile[kl*VW3 + 5][lid] = avec.s5; + tile[kl*VW3 + 6][lid] = avec.s6; + tile[kl*VW3 + 7][lid] = avec.s7; + tile[kl*VW3 + 8][lid] = avec.s8; + tile[kl*VW3 + 9][lid] = avec.s9; + tile[kl*VW3 + 10][lid] = avec.sA; + tile[kl*VW3 + 11][lid] = avec.sB; + tile[kl*VW3 + 12][lid] = avec.sC; + tile[kl*VW3 + 13][lid] = avec.sD; + tile[kl*VW3 + 14][lid] = avec.sE; + tile[kl*VW3 + 15][lid] = avec.sF; #endif } @@ -272,11 +272,13 @@ __kernel void XgemvFastRot(const int m, const int n, // The multiply-add function (rotated) #pragma unroll - for (int kl=0; kl<WPT3; ++kl) { - const int k = kl * (WGS3/WPT3) + lid_div; - real aval = tile[k * WPT3 + lid_mod]; - real xval = xlm[kl]; - MultiplyAdd(acc, xval, aval); + for (int kl=0; kl<WPT3/VW3; ++kl) { + #pragma unroll + for (int v=0; v<VW3; ++v) { + real aval = tile[lid_mod*VW3 + v][lid_div * (WPT3/VW3) + kl]; + real xval = xlm[kl*VW3 + v]; + MultiplyAdd(acc, xval, aval); + } } // Synchronizes all threads in a workgroup diff --git a/src/tuning/kernels/xgemv.cpp b/src/tuning/kernels/xgemv.cpp index b69e4352..8446e4a9 100644 --- a/src/tuning/kernels/xgemv.cpp +++ b/src/tuning/kernels/xgemv.cpp @@ -61,7 +61,7 @@ class TuneXgemv { // Sets the tuning parameters and their possible values static void SetParameters(cltune::Tuner &tuner, const size_t id) { - tuner.AddParameter(id, "WGS"+std::to_string(V), {32, 64, 128, 256}); + tuner.AddParameter(id, "WGS"+std::to_string(V), {16, 32, 64, 128}); if (V==1 || V==2) { tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4}); } else { tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4, 8, 16, 32}); } if (V==2 || V==3) { tuner.AddParameter(id, "VW"+std::to_string(V), {1, 2, 4, 8}); } @@ -69,10 +69,14 @@ class TuneXgemv { // Sets the constraints and local memory size static void SetConstraints(cltune::Tuner &tuner, const size_t id) { - auto MultipleOfX = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]); }; if (V==2 || V==3) { + auto MultipleOfX = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]); }; tuner.AddConstraint(id, MultipleOfX, {"WPT"+std::to_string(V), "VW"+std::to_string(V)}); } + if (V==3) { + auto LargerOrEqual = [] (std::vector<size_t> v) { return v[0] >= v[1]; }; + tuner.AddConstraint(id, LargerOrEqual, {"WGS"+std::to_string(V), "WPT"+std::to_string(V)}); + } } static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) { if (V==1 || V==2) { |