summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorCedric Nugteren <web@cedricnugteren.nl>2016-07-23 14:52:32 +0200
committerCedric Nugteren <web@cedricnugteren.nl>2016-07-23 14:52:32 +0200
commit7a4f9637639ce83191bc2d6e8485f9a9dfd949af (patch)
tree6885e17ee3705ac8235309a481b4d248e9542398 /src
parent75fe8235f78520fbbfff7c9c035ecd9f1aa3e6f6 (diff)
Further improvements to the XgemvFastRot kernel, properly enables coalescing now
Diffstat (limited to 'src')
-rw-r--r--src/database/kernels/xgemv.hpp2
-rw-r--r--src/kernels/level2/xgemv_fast.opencl84
-rw-r--r--src/tuning/kernels/xgemv.cpp8
3 files changed, 50 insertions, 44 deletions
diff --git a/src/database/kernels/xgemv.hpp b/src/database/kernels/xgemv.hpp
index 6fb68858..3aa1863f 100644
--- a/src/database/kernels/xgemv.hpp
+++ b/src/database/kernels/xgemv.hpp
@@ -36,7 +36,7 @@ const Database::DatabaseEntry Database::XgemvSingle = {
"Xgemv", Precision::kSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
- { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",4}, {"WGS3",32}, {"WPT3",32} } },
+ { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",8}, {"WGS3",16}, {"WPT3",16} } },
{ "Hawaii", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Oland", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",4}, {"WGS3",256}, {"WPT3",4} } },
{ "Pitcairn", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
diff --git a/src/kernels/level2/xgemv_fast.opencl b/src/kernels/level2/xgemv_fast.opencl
index 1d34de96..359c3770 100644
--- a/src/kernels/level2/xgemv_fast.opencl
+++ b/src/kernels/level2/xgemv_fast.opencl
@@ -204,10 +204,10 @@ __kernel void XgemvFastRot(const int m, const int n,
const real beta = GetRealArg(arg_beta);
// Local memory to store a tile of the matrix (for coalescing)
- __local real tile[WGS3 * WPT3];
+ __local real tile[WPT3][WGS3];
const int lid = get_local_id(0);
- const int lid_mod = lid % WPT3;
- const int lid_div = lid / WPT3;
+ const int lid_mod = lid % (WPT3/VW3);
+ const int lid_div = lid / (WPT3/VW3);
// Local memory for the vector X
__local real xlm[WPT3];
@@ -225,45 +225,45 @@ __kernel void XgemvFastRot(const int m, const int n,
// Loads the matrix A into local memory
#pragma unroll
for (int kl=0; kl<WPT3/VW3; ++kl) {
- const int x = (kwg/VW3) + kl;
- const int y = get_group_id(0) * WGS3 + lid;
+ const int x = (kwg/VW3) + lid_mod;
+ const int y = get_group_id(0) * WGS3 + lid_div * (WPT3/VW3) + kl;
realVFR avec = agm[(a_ld/VW3) * y + x];
#if VW3 == 1
- tile[(kl*VW3 + 0) * WGS3 + lid] = avec;
+ tile[kl*VW3 + 0][lid] = avec;
#elif VW3 == 2
- tile[(kl*VW3 + 0) * WGS3 + lid] = avec.x;
- tile[(kl*VW3 + 1) * WGS3 + lid] = avec.y;
+ tile[kl*VW3 + 0][lid] = avec.x;
+ tile[kl*VW3 + 1][lid] = avec.y;
#elif VW3 == 4
- tile[(kl*VW3 + 0) * WGS3 + lid] = avec.x;
- tile[(kl*VW3 + 1) * WGS3 + lid] = avec.y;
- tile[(kl*VW3 + 2) * WGS3 + lid] = avec.z;
- tile[(kl*VW3 + 3) * WGS3 + lid] = avec.w;
+ tile[kl*VW3 + 0][lid] = avec.x;
+ tile[kl*VW3 + 1][lid] = avec.y;
+ tile[kl*VW3 + 2][lid] = avec.z;
+ tile[kl*VW3 + 3][lid] = avec.w;
#elif VW3 == 8
- tile[(kl*VW3 + 0) * WGS3 + lid] = avec.s0;
- tile[(kl*VW3 + 1) * WGS3 + lid] = avec.s1;
- tile[(kl*VW3 + 2) * WGS3 + lid] = avec.s2;
- tile[(kl*VW3 + 3) * WGS3 + lid] = avec.s3;
- tile[(kl*VW3 + 4) * WGS3 + lid] = avec.s4;
- tile[(kl*VW3 + 5) * WGS3 + lid] = avec.s5;
- tile[(kl*VW3 + 6) * WGS3 + lid] = avec.s6;
- tile[(kl*VW3 + 7) * WGS3 + lid] = avec.s7;
+ tile[kl*VW3 + 0][lid] = avec.s0;
+ tile[kl*VW3 + 1][lid] = avec.s1;
+ tile[kl*VW3 + 2][lid] = avec.s2;
+ tile[kl*VW3 + 3][lid] = avec.s3;
+ tile[kl*VW3 + 4][lid] = avec.s4;
+ tile[kl*VW3 + 5][lid] = avec.s5;
+ tile[kl*VW3 + 6][lid] = avec.s6;
+ tile[kl*VW3 + 7][lid] = avec.s7;
#elif VW3 == 16
- tile[(kl*VW3 + 0) * WGS3 + lid] = avec.s0;
- tile[(kl*VW3 + 1) * WGS3 + lid] = avec.s1;
- tile[(kl*VW3 + 2) * WGS3 + lid] = avec.s2;
- tile[(kl*VW3 + 3) * WGS3 + lid] = avec.s3;
- tile[(kl*VW3 + 4) * WGS3 + lid] = avec.s4;
- tile[(kl*VW3 + 5) * WGS3 + lid] = avec.s5;
- tile[(kl*VW3 + 6) * WGS3 + lid] = avec.s6;
- tile[(kl*VW3 + 7) * WGS3 + lid] = avec.s7;
- tile[(kl*VW3 + 8) * WGS3 + lid] = avec.s8;
- tile[(kl*VW3 + 9) * WGS3 + lid] = avec.s9;
- tile[(kl*VW3 + 10) * WGS3 + lid] = avec.sA;
- tile[(kl*VW3 + 11) * WGS3 + lid] = avec.sB;
- tile[(kl*VW3 + 12) * WGS3 + lid] = avec.sC;
- tile[(kl*VW3 + 13) * WGS3 + lid] = avec.sD;
- tile[(kl*VW3 + 14) * WGS3 + lid] = avec.sE;
- tile[(kl*VW3 + 15) * WGS3 + lid] = avec.sF;
+ tile[kl*VW3 + 0][lid] = avec.s0;
+ tile[kl*VW3 + 1][lid] = avec.s1;
+ tile[kl*VW3 + 2][lid] = avec.s2;
+ tile[kl*VW3 + 3][lid] = avec.s3;
+ tile[kl*VW3 + 4][lid] = avec.s4;
+ tile[kl*VW3 + 5][lid] = avec.s5;
+ tile[kl*VW3 + 6][lid] = avec.s6;
+ tile[kl*VW3 + 7][lid] = avec.s7;
+ tile[kl*VW3 + 8][lid] = avec.s8;
+ tile[kl*VW3 + 9][lid] = avec.s9;
+ tile[kl*VW3 + 10][lid] = avec.sA;
+ tile[kl*VW3 + 11][lid] = avec.sB;
+ tile[kl*VW3 + 12][lid] = avec.sC;
+ tile[kl*VW3 + 13][lid] = avec.sD;
+ tile[kl*VW3 + 14][lid] = avec.sE;
+ tile[kl*VW3 + 15][lid] = avec.sF;
#endif
}
@@ -272,11 +272,13 @@ __kernel void XgemvFastRot(const int m, const int n,
// The multiply-add function (rotated)
#pragma unroll
- for (int kl=0; kl<WPT3; ++kl) {
- const int k = kl * (WGS3/WPT3) + lid_div;
- real aval = tile[k * WPT3 + lid_mod];
- real xval = xlm[kl];
- MultiplyAdd(acc, xval, aval);
+ for (int kl=0; kl<WPT3/VW3; ++kl) {
+ #pragma unroll
+ for (int v=0; v<VW3; ++v) {
+ real aval = tile[lid_mod*VW3 + v][lid_div * (WPT3/VW3) + kl];
+ real xval = xlm[kl*VW3 + v];
+ MultiplyAdd(acc, xval, aval);
+ }
}
// Synchronizes all threads in a workgroup
diff --git a/src/tuning/kernels/xgemv.cpp b/src/tuning/kernels/xgemv.cpp
index b69e4352..8446e4a9 100644
--- a/src/tuning/kernels/xgemv.cpp
+++ b/src/tuning/kernels/xgemv.cpp
@@ -61,7 +61,7 @@ class TuneXgemv {
// Sets the tuning parameters and their possible values
static void SetParameters(cltune::Tuner &tuner, const size_t id) {
- tuner.AddParameter(id, "WGS"+std::to_string(V), {32, 64, 128, 256});
+ tuner.AddParameter(id, "WGS"+std::to_string(V), {16, 32, 64, 128});
if (V==1 || V==2) { tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4}); }
else { tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4, 8, 16, 32}); }
if (V==2 || V==3) { tuner.AddParameter(id, "VW"+std::to_string(V), {1, 2, 4, 8}); }
@@ -69,10 +69,14 @@ class TuneXgemv {
// Sets the constraints and local memory size
static void SetConstraints(cltune::Tuner &tuner, const size_t id) {
- auto MultipleOfX = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]); };
if (V==2 || V==3) {
+ auto MultipleOfX = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]); };
tuner.AddConstraint(id, MultipleOfX, {"WPT"+std::to_string(V), "VW"+std::to_string(V)});
}
+ if (V==3) {
+ auto LargerOrEqual = [] (std::vector<size_t> v) { return v[0] >= v[1]; };
+ tuner.AddConstraint(id, LargerOrEqual, {"WGS"+std::to_string(V), "WPT"+std::to_string(V)});
+ }
}
static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) {
if (V==1 || V==2) {