diff options
-rw-r--r-- | include/internal/database/xgemv.h | 2 | ||||
-rw-r--r-- | src/kernels/xgemv.opencl | 235 | ||||
-rw-r--r-- | src/routines/xgemv.cc | 11 | ||||
-rw-r--r-- | src/tuning/tuning.cc | 1 | ||||
-rw-r--r-- | src/tuning/xgemv.cc | 14 |
5 files changed, 209 insertions, 54 deletions
diff --git a/include/internal/database/xgemv.h b/include/internal/database/xgemv.h index b47df4d2..37d33487 100644 --- a/include/internal/database/xgemv.h +++ b/include/internal/database/xgemv.h @@ -30,7 +30,7 @@ const Database::DatabaseEntry Database::XgemvSingle = { }, { // Intel GPUs CL_DEVICE_TYPE_GPU, "Intel", { - { "Iris", { {"WGS",256}, {"WPT",2}, {"VW",1} } }, + { "Iris", { {"WGS",128}, {"WPT",4}, {"VW",4} } }, } }, { // Default diff --git a/src/kernels/xgemv.opencl b/src/kernels/xgemv.opencl index 46a5e784..26e7587f 100644 --- a/src/kernels/xgemv.opencl +++ b/src/kernels/xgemv.opencl @@ -26,54 +26,12 @@ R"( #define WPT 1 // The amount of work-per-thread #endif #ifndef VW - #define VW 1 // Vector width of vectors X and Y + #define VW 1 // Vector width of matrix A loads (only for the fast kernel) #endif // ================================================================================================= -// The multiply-add function for the main part (divisable by WGS) -inline void MatrixVectorMain(const __global real* restrict agm, __local real* xlm, real acc[WPT], - const int gid, const int w, const int kwg, - const int a_ld, const int a_offset, const int a_transposed) { - if (a_transposed == 0) { // Not transposed - #pragma unroll - for (int kl=0; kl<WGS; ++kl) { - const int k = kwg + kl; - MultiplyAdd(acc[w], agm[gid + a_ld*k + a_offset], xlm[kl]); - } - } - else { // Transposed - #pragma unroll - for (int kl=0; kl<WGS; ++kl) { - const int k = kwg + kl; - MultiplyAdd(acc[w], agm[k + a_ld*gid + a_offset], xlm[kl]); - } - } -} - -// The multiply-add function for the remainder part (not divisable by WGS) -inline void MatrixVectorRemainder(const __global real* restrict agm, - const __global real* restrict xgm, real acc[WPT], - const int gid, const int w, const int n_floor, const int n, - const int a_ld, const int a_offset, const int a_transposed, - const int x_inc, const int x_offset) { - if (a_transposed == 0) { // Not transposed - #pragma unroll - for (int k=n_floor; k<n; ++k) { - MultiplyAdd(acc[w], agm[gid + a_ld*k + a_offset], xgm[k*x_inc + x_offset]); - } - } - else { // Transposed - #pragma unroll - for (int k=n_floor; k<n; ++k) { - MultiplyAdd(acc[w], agm[k + a_ld*gid + a_offset], xgm[k*x_inc + x_offset]); - } - } -} - -// ================================================================================================= - -// The gemv kernel +// Full version of the kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) __kernel void Xgemv(const int m, const int n, const real alpha, const real beta, const int a_transposed, @@ -110,7 +68,22 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta, for (int w=0; w<WPT; ++w) { const int gid = w*get_global_size(0) + get_global_id(0); if (gid < m) { - MatrixVectorMain(agm, xlm, acc, gid, w, kwg, a_ld, a_offset, a_transposed); + + // The multiply-add function for the main part (divisable by WGS) + if (a_transposed == 0) { // Not transposed + #pragma unroll + for (int kl=0; kl<WGS; ++kl) { + const int k = kwg + kl; + MultiplyAdd(acc[w], xlm[kl], agm[gid + a_ld*k + a_offset]); + } + } + else { // Transposed + #pragma unroll + for (int kl=0; kl<WGS; ++kl) { + const int k = kwg + kl; + MultiplyAdd(acc[w], xlm[kl], agm[k + a_ld*gid + a_offset]); + } + } } } @@ -123,8 +96,20 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta, for (int w=0; w<WPT; ++w) { const int gid = w*get_global_size(0) + get_global_id(0); if (gid < m) { - MatrixVectorRemainder(agm, xgm, acc, gid, w, n_floor, n, - a_ld, a_offset, a_transposed, x_inc, x_offset); + + // The multiply-add function for the remainder part (not divisable by WGS) + if (a_transposed == 0) { // Not transposed + #pragma unroll + for (int k=n_floor; k<n; ++k) { + MultiplyAdd(acc[w], xgm[k*x_inc + x_offset], agm[gid + a_ld*k + a_offset]); + } + } + else { // Transposed + #pragma unroll + for (int k=n_floor; k<n; ++k) { + MultiplyAdd(acc[w], xgm[k*x_inc + x_offset], agm[k + a_ld*gid + a_offset]); + } + } // Stores the final result AXPBY(ygm[gid*y_inc + y_offset], alpha, acc[w], beta, ygm[gid*y_inc + y_offset]); @@ -134,6 +119,162 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta, // ================================================================================================= +// Data-widths for the 'fast' kernel +#if VW == 1 + typedef real realV; +#elif VW == 2 + typedef real2 realV; +#elif VW == 4 + typedef real4 realV; +#elif VW == 8 + typedef real8 realV; +#elif VW == 16 + typedef real16 realV; +#endif + +// Faster version of the kernel, assuming that: +// --> 'm' and 'n' are multiples of WGS +// --> 'a_offset' is 0 +// --> 'a_ld' is a multiple of VW +__attribute__((reqd_work_group_size(WGS, 1, 1))) +__kernel void XgemvFast(const int m, const int n, const real alpha, const real beta, + const int a_transposed, + const __global realV* restrict agm, const int a_offset, const int a_ld, + const __global real* restrict xgm, const int x_offset, const int x_inc, + __global real* ygm, const int y_offset, const int y_inc) { + // Local memory for the vector X + __local real xlm[WGS]; + + // Initializes the accumulation register + real acc[WPT]; + #pragma unroll + for (int w=0; w<WPT; ++w) { + SetToZero(acc[w]); + } + + // Loops over work-group sized portions of the work + for (int kwg=0; kwg<n; kwg+=WGS) { + + // Loads the vector X into local memory + const int lid = get_local_id(0); + xlm[lid] = xgm[(kwg + lid)*x_inc + x_offset]; + + // Synchronizes all threads in a workgroup + barrier(CLK_LOCAL_MEM_FENCE); + + // The multiply-add function (not transposed) + if (a_transposed == 0) { + #pragma unroll + for (int kl=0; kl<WGS; ++kl) { + const int k = kwg + kl; + #pragma unroll + for (int w=0; w<WPT/VW; ++w) { + const int gid = (WPT/VW)*get_global_id(0) + w; + #if VW == 1 + MultiplyAdd(acc[VW*w+0], xlm[kl], agm[gid + (a_ld/VW)*k]); + #elif VW == 2 + MultiplyAdd(acc[VW*w+0], xlm[kl], agm[gid + (a_ld/VW)*k].x); + MultiplyAdd(acc[VW*w+1], xlm[kl], agm[gid + (a_ld/VW)*k].y); + #elif VW == 4 + MultiplyAdd(acc[VW*w+0], xlm[kl], agm[gid + (a_ld/VW)*k].x); + MultiplyAdd(acc[VW*w+1], xlm[kl], agm[gid + (a_ld/VW)*k].y); + MultiplyAdd(acc[VW*w+2], xlm[kl], agm[gid + (a_ld/VW)*k].z); + MultiplyAdd(acc[VW*w+3], xlm[kl], agm[gid + (a_ld/VW)*k].w); + #elif VW == 8 + MultiplyAdd(acc[VW*w+0], xlm[kl], agm[gid + (a_ld/VW)*k].s0); + MultiplyAdd(acc[VW*w+1], xlm[kl], agm[gid + (a_ld/VW)*k].s1); + MultiplyAdd(acc[VW*w+2], xlm[kl], agm[gid + (a_ld/VW)*k].s2); + MultiplyAdd(acc[VW*w+3], xlm[kl], agm[gid + (a_ld/VW)*k].s3); + MultiplyAdd(acc[VW*w+4], xlm[kl], agm[gid + (a_ld/VW)*k].s4); + MultiplyAdd(acc[VW*w+5], xlm[kl], agm[gid + (a_ld/VW)*k].s5); + MultiplyAdd(acc[VW*w+6], xlm[kl], agm[gid + (a_ld/VW)*k].s6); + MultiplyAdd(acc[VW*w+7], xlm[kl], agm[gid + (a_ld/VW)*k].s7); + #elif VW == 16 + MultiplyAdd(acc[VW*w+0], xlm[kl], agm[gid + (a_ld/VW)*k].s0); + MultiplyAdd(acc[VW*w+1], xlm[kl], agm[gid + (a_ld/VW)*k].s1); + MultiplyAdd(acc[VW*w+2], xlm[kl], agm[gid + (a_ld/VW)*k].s2); + MultiplyAdd(acc[VW*w+3], xlm[kl], agm[gid + (a_ld/VW)*k].s3); + MultiplyAdd(acc[VW*w+4], xlm[kl], agm[gid + (a_ld/VW)*k].s4); + MultiplyAdd(acc[VW*w+5], xlm[kl], agm[gid + (a_ld/VW)*k].s5); + MultiplyAdd(acc[VW*w+6], xlm[kl], agm[gid + (a_ld/VW)*k].s6); + MultiplyAdd(acc[VW*w+7], xlm[kl], agm[gid + (a_ld/VW)*k].s7); + MultiplyAdd(acc[VW*w+8], xlm[kl], agm[gid + (a_ld/VW)*k].s8); + MultiplyAdd(acc[VW*w+9], xlm[kl], agm[gid + (a_ld/VW)*k].s9); + MultiplyAdd(acc[VW*w+10], xlm[kl], agm[gid + (a_ld/VW)*k].sA); + MultiplyAdd(acc[VW*w+11], xlm[kl], agm[gid + (a_ld/VW)*k].sB); + MultiplyAdd(acc[VW*w+12], xlm[kl], agm[gid + (a_ld/VW)*k].sC); + MultiplyAdd(acc[VW*w+13], xlm[kl], agm[gid + (a_ld/VW)*k].sD); + MultiplyAdd(acc[VW*w+14], xlm[kl], agm[gid + (a_ld/VW)*k].sE); + MultiplyAdd(acc[VW*w+15], xlm[kl], agm[gid + (a_ld/VW)*k].sF); + #endif + } + } + } + + // The multiply-add function (transposed) + else { + #pragma unroll + for (int kl=0; kl<WGS/VW; ++kl) { + const int k = (kwg/VW) + kl; + #pragma unroll + for (int w=0; w<WPT; ++w) { + const int gid = WPT*get_global_id(0) + w; + realV avec = agm[k + (a_ld/VW)*gid]; + #if VW == 1 + MultiplyAdd(acc[w], xlm[VW*kl+0], avec); + #elif VW == 2 + MultiplyAdd(acc[w], xlm[VW*kl+0], avec.x); + MultiplyAdd(acc[w], xlm[VW*kl+1], avec.y); + #elif VW == 4 + MultiplyAdd(acc[w], xlm[VW*kl+0], avec.x); + MultiplyAdd(acc[w], xlm[VW*kl+1], avec.y); + MultiplyAdd(acc[w], xlm[VW*kl+2], avec.z); + MultiplyAdd(acc[w], xlm[VW*kl+3], avec.w); + #elif VW == 8 + MultiplyAdd(acc[w], xlm[VW*kl+0], avec.s0); + MultiplyAdd(acc[w], xlm[VW*kl+1], avec.s1); + MultiplyAdd(acc[w], xlm[VW*kl+2], avec.s2); + MultiplyAdd(acc[w], xlm[VW*kl+3], avec.s3); + MultiplyAdd(acc[w], xlm[VW*kl+4], avec.s4); + MultiplyAdd(acc[w], xlm[VW*kl+5], avec.s5); + MultiplyAdd(acc[w], xlm[VW*kl+6], avec.s6); + MultiplyAdd(acc[w], xlm[VW*kl+7], avec.s7); + #elif VW == 16 + MultiplyAdd(acc[w], xlm[VW*kl+0], avec.s0); + MultiplyAdd(acc[w], xlm[VW*kl+1], avec.s1); + MultiplyAdd(acc[w], xlm[VW*kl+2], avec.s2); + MultiplyAdd(acc[w], xlm[VW*kl+3], avec.s3); + MultiplyAdd(acc[w], xlm[VW*kl+4], avec.s4); + MultiplyAdd(acc[w], xlm[VW*kl+5], avec.s5); + MultiplyAdd(acc[w], xlm[VW*kl+6], avec.s6); + MultiplyAdd(acc[w], xlm[VW*kl+7], avec.s7); + MultiplyAdd(acc[w], xlm[VW*kl+8], avec.s8); + MultiplyAdd(acc[w], xlm[VW*kl+9], avec.s9); + MultiplyAdd(acc[w], xlm[VW*kl+10], avec.sA); + MultiplyAdd(acc[w], xlm[VW*kl+11], avec.sB); + MultiplyAdd(acc[w], xlm[VW*kl+12], avec.sC); + MultiplyAdd(acc[w], xlm[VW*kl+13], avec.sD); + MultiplyAdd(acc[w], xlm[VW*kl+14], avec.sE); + MultiplyAdd(acc[w], xlm[VW*kl+15], avec.sF); + #endif + } + } + } + + // Synchronizes all threads in a workgroup + barrier(CLK_LOCAL_MEM_FENCE); + } + + // Stores the final result + #pragma unroll + for (int w=0; w<WPT; ++w) { + const int gid = WPT*get_global_id(0) + w; + AXPBY(ygm[gid*y_inc + y_offset], alpha, acc[w], beta, ygm[gid*y_inc + y_offset]); + } +} + +// ================================================================================================= + // End of the C++11 raw string literal )"; diff --git a/src/routines/xgemv.cc b/src/routines/xgemv.cc index 67f9536e..e4707987 100644 --- a/src/routines/xgemv.cc +++ b/src/routines/xgemv.cc @@ -69,10 +69,19 @@ StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose, status = TestVectorY(m_real, y_buffer, y_offset, y_inc, sizeof(T)); if (ErrorIn(status)) { return status; } + // Determines whether or not the fast-version can be used + bool use_fast_kernel = (a_offset == 0) && + IsMultiple(m, db_["WGS"]*db_["WPT"]) && + IsMultiple(n, db_["WGS"]) && + IsMultiple(a_ld, db_["VW"]); + + // If possible, run the fast-version of the kernel + auto kernel_name = (use_fast_kernel) ? "XgemvFast" : "Xgemv"; + // Retrieves the Xgemv kernel from the compiled binary try { auto program = GetProgramFromCache(); - auto kernel = Kernel(program, "Xgemv"); + auto kernel = Kernel(program, kernel_name); // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(m_real)); diff --git a/src/tuning/tuning.cc b/src/tuning/tuning.cc index 94333089..d617af88 100644 --- a/src/tuning/tuning.cc +++ b/src/tuning/tuning.cc @@ -87,6 +87,7 @@ void TunerAXY(int argc, char* argv[], const Tuner3<T> &tune_function) { args.n = GetArgument(argc, argv, help, kArgN, size_t{1024}); args.alpha = GetArgument(argc, argv, help, kArgAlpha, GetScalar<T>()); args.beta = GetArgument(argc, argv, help, kArgBeta, GetScalar<T>()); + args.layout = GetArgument(argc, argv, help, kArgLayout, Layout::kColMajor); fprintf(stdout, "%s\n", help.c_str()); // Creates input buffers with random data diff --git a/src/tuning/xgemv.cc b/src/tuning/xgemv.cc index 6037a5a0..e2d54729 100644 --- a/src/tuning/xgemv.cc +++ b/src/tuning/xgemv.cc @@ -33,29 +33,33 @@ void XgemvTune(const Arguments<T> &args, std::string kernel_source = #include "../src/kernels/xgemv.opencl" auto sources = common_source + kernel_source; - auto id = tuner.AddKernelFromString(sources, "Xgemv", {args.m}, {1}); + auto id = tuner.AddKernelFromString(sources, "XgemvFast", {args.m}, {1}); tuner.SetReferenceFromString(sources, "Xgemv", {args.m}, {64}); // Sets the tunable parameters and their possible values tuner.AddParameter(id, "WGS", {64, 128, 256, 512, 1024, 1536, 2048}); - tuner.AddParameter(id, "WPT", {1, 2, 4}); - tuner.AddParameter(id, "VW", {1}); + tuner.AddParameter(id, "WPT", {1, 2, 4, 8}); + tuner.AddParameter(id, "VW", {1, 2, 4, 8}); // Tests for a specific precision tuner.AddParameter(id, "PRECISION", {static_cast<size_t>(args.precision)}); tuner.AddParameterReference("PRECISION", static_cast<size_t>(args.precision)); + // Sets the constraints + auto MultipleOfX = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]); }; + tuner.AddConstraint(id, MultipleOfX, {"WGS", "VW"}); + tuner.AddConstraint(id, MultipleOfX, {"WPT", "VW"}); + // Modifies the thread-sizes (local) based on the parameters tuner.MulLocalSize(id, {"WGS"}); tuner.DivGlobalSize(id, {"WPT"}); - tuner.DivGlobalSize(id, {"VW"}); // Sets the function's arguments tuner.AddArgumentScalar(static_cast<int>(args.m)); tuner.AddArgumentScalar(static_cast<int>(args.n)); tuner.AddArgumentScalar(args.alpha); tuner.AddArgumentScalar(args.beta); - tuner.AddArgumentScalar(0); + tuner.AddArgumentScalar(static_cast<int>(args.layout)); tuner.AddArgumentInput(a_mat); tuner.AddArgumentScalar(0); tuner.AddArgumentScalar(static_cast<int>(args.m)); |