From 7a4f9637639ce83191bc2d6e8485f9a9dfd949af Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 23 Jul 2016 14:52:32 +0200 Subject: Further improvements to the XgemvFastRot kernel, properly enables coalescing now --- src/database/kernels/xgemv.hpp | 2 +- src/kernels/level2/xgemv_fast.opencl | 84 ++++++++++++++++++------------------ src/tuning/kernels/xgemv.cpp | 8 +++- 3 files changed, 50 insertions(+), 44 deletions(-) (limited to 'src') diff --git a/src/database/kernels/xgemv.hpp b/src/database/kernels/xgemv.hpp index 6fb68858..3aa1863f 100644 --- a/src/database/kernels/xgemv.hpp +++ b/src/database/kernels/xgemv.hpp @@ -36,7 +36,7 @@ const Database::DatabaseEntry Database::XgemvSingle = { "Xgemv", Precision::kSingle, { { // AMD GPUs kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",4}, {"WGS3",32}, {"WPT3",32} } }, + { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",8}, {"WGS3",16}, {"WPT3",16} } }, { "Hawaii", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, { "Oland", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",4}, {"WGS3",256}, {"WPT3",4} } }, { "Pitcairn", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, diff --git a/src/kernels/level2/xgemv_fast.opencl b/src/kernels/level2/xgemv_fast.opencl index 1d34de96..359c3770 100644 --- a/src/kernels/level2/xgemv_fast.opencl +++ b/src/kernels/level2/xgemv_fast.opencl @@ -204,10 +204,10 @@ __kernel void XgemvFastRot(const int m, const int n, const real beta = GetRealArg(arg_beta); // Local memory to store a tile of the matrix (for coalescing) - __local real tile[WGS3 * WPT3]; + __local real tile[WPT3][WGS3]; const int lid = get_local_id(0); - const int lid_mod = lid % WPT3; - const int lid_div = lid / WPT3; + const int lid_mod = lid % (WPT3/VW3); + const int lid_div = lid / (WPT3/VW3); // Local memory for the vector X __local real xlm[WPT3]; @@ -225,45 +225,45 @@ __kernel void XgemvFastRot(const int m, const int n, // Loads the matrix A into local memory #pragma unroll for (int kl=0; kl v) { return IsMultiple(v[0], v[1]); }; if (V==2 || V==3) { + auto MultipleOfX = [] (std::vector v) { return IsMultiple(v[0], v[1]); }; tuner.AddConstraint(id, MultipleOfX, {"WPT"+std::to_string(V), "VW"+std::to_string(V)}); } + if (V==3) { + auto LargerOrEqual = [] (std::vector v) { return v[0] >= v[1]; }; + tuner.AddConstraint(id, LargerOrEqual, {"WGS"+std::to_string(V), "WPT"+std::to_string(V)}); + } } static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments &args) { if (V==1 || V==2) { -- cgit v1.2.3