From 75fe8235f78520fbbfff7c9c035ecd9f1aa3e6f6 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 23 Jul 2016 10:20:11 +0200 Subject: Improved the XgemvFastRot kernel by tiled loading of the input matrix A, enabling better memory performance --- src/database/kernels/xgemv.hpp | 2 +- src/kernels/level2/xgemv_fast.opencl | 137 +++++++++++++++++------------------ src/routines/level2/xgemv.cpp | 2 +- src/tuning/kernels/xgemv.cpp | 20 +++-- 4 files changed, 85 insertions(+), 76 deletions(-) diff --git a/src/database/kernels/xgemv.hpp b/src/database/kernels/xgemv.hpp index 65f4b5c8..6fb68858 100644 --- a/src/database/kernels/xgemv.hpp +++ b/src/database/kernels/xgemv.hpp @@ -36,7 +36,7 @@ const Database::DatabaseEntry Database::XgemvSingle = { "Xgemv", Precision::kSingle, { { // AMD GPUs kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",4}, {"WGS3",32}, {"WPT3",32} } }, { "Hawaii", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, { "Oland", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",4}, {"WGS3",256}, {"WPT3",4} } }, { "Pitcairn", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, diff --git a/src/kernels/level2/xgemv_fast.opencl b/src/kernels/level2/xgemv_fast.opencl index 1127a0b6..1d34de96 100644 --- a/src/kernels/level2/xgemv_fast.opencl +++ b/src/kernels/level2/xgemv_fast.opencl @@ -38,7 +38,7 @@ R"( #define WGS3 64 // The local work-group size #endif #ifndef WPT3 - #define WPT3 1 // The amount of work-per-thread + #define WPT3 1 // The tile-size #endif #ifndef VW3 #define VW3 1 // Vector width of matrix A loads @@ -74,18 +74,12 @@ R"( // ================================================================================================= -// Loads a vector input value (1/2) +// Loads a vector input value inline realVF LoadMatrixAVF(const __global realVF* restrict agm, const int x, const int y, const int a_ld) { return agm[a_ld*y + x]; } -// Loads a vector input value (2/2): as before, but different data-type -inline realVFR LoadMatrixAVFR(const __global realVFR* restrict agm, const int x, const int y, - const int a_ld) { - return agm[a_ld*y + x]; -} - // ================================================================================================= // Faster version of the kernel, assuming that: @@ -110,7 +104,7 @@ __kernel void XgemvFast(const int m, const int n, // Local memory for the vector X __local real xlm[WGS2]; - // Initializes the accumulation register + // Initializes the accumulation registers real acc[WPT2]; #pragma unroll for (int w=0; w::MatVec(const Layout layout, const Transpose a_transpose, } if (fast_kernel_rot) { kernel_name = "XgemvFastRot"; - global_size = m_real / db_["WPT3"]; + global_size = m_real; local_size = db_["WGS3"]; } diff --git a/src/tuning/kernels/xgemv.cpp b/src/tuning/kernels/xgemv.cpp index 5c187d33..b69e4352 100644 --- a/src/tuning/kernels/xgemv.cpp +++ b/src/tuning/kernels/xgemv.cpp @@ -61,8 +61,9 @@ class TuneXgemv { // Sets the tuning parameters and their possible values static void SetParameters(cltune::Tuner &tuner, const size_t id) { - tuner.AddParameter(id, "WGS"+std::to_string(V), {64, 128, 256}); - tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4}); + tuner.AddParameter(id, "WGS"+std::to_string(V), {32, 64, 128, 256}); + if (V==1 || V==2) { tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4}); } + else { tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4, 8, 16, 32}); } if (V==2 || V==3) { tuner.AddParameter(id, "VW"+std::to_string(V), {1, 2, 4, 8}); } } @@ -74,8 +75,14 @@ class TuneXgemv { } } static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments &args) { - auto LocalMemorySize = [args] (std::vector v) { return v[0]*GetBytes(args.precision); }; - tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGS"+std::to_string(V)}); + if (V==1 || V==2) { + auto LocalMemorySize = [args] (std::vector v) { return v[0]*GetBytes(args.precision); }; + tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGS"+std::to_string(V)}); + } + else { + auto LocalMemorySize = [args] (std::vector v) { return (v[0]*v[1] + v[1])*GetBytes(args.precision); }; + tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGS"+std::to_string(V), "WPT"+std::to_string(V)}); + } } // Sets the base thread configuration @@ -89,7 +96,10 @@ class TuneXgemv { static TransformVector MulLocal() { return {{"WGS"+std::to_string(V)}}; } static TransformVector DivLocal() { return {}; } static TransformVector MulGlobal() { return {}; } - static TransformVector DivGlobal() { return {{"WPT"+std::to_string(V)}}; } + static TransformVector DivGlobal() { + if (V==1 || V==2) return {{"WPT"+std::to_string(V)}}; + return {}; + } // Sets the kernel's arguments static void SetArguments(cltune::Tuner &tuner, const Arguments &args, -- cgit v1.2.3