Diffstat (limited to 'src')
 src/kernels/xgemv.opencl | 342
 src/routines/xgemv.cc    |  36
 src/tuning/tuning.cc     |  54
 src/tuning/xgemv.cc      |  62
4 files changed, 306 insertions, 188 deletions
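The change below splits the tuned GEMV kernel into three variants: Xgemv (the general case), XgemvFast (fast path for non-rotated matrices) and XgemvFastRot (fast path for rotated matrices), each with its own work-group size (WGS1/WGS2/WGS3), work-per-thread (WPT1/WPT2/WPT3) and, for the fast paths, vector width of A loads (VW2/VW3). All three compute the same operation; as a point of reference, a minimal host-side sketch of that operation follows (an illustration only, not CLBlast code; it ignores the a_offset, x_inc and y_inc arguments the kernels also take):

    // Reference GEMV: y = alpha * A * x + beta * y.
    // 'a_rotated' mirrors the kernel argument: 0 means element (row i, column k)
    // lives at a[i + a_ld*k]; 1 means it lives at a[k + a_ld*i].
    #include <cstddef>
    #include <vector>

    void GemvReference(std::size_t m, std::size_t n, float alpha, float beta, int a_rotated,
                       const std::vector<float> &a, std::size_t a_ld,
                       const std::vector<float> &x, std::vector<float> &y) {
      for (std::size_t i = 0; i < m; ++i) {
        float acc = 0.0f;
        for (std::size_t k = 0; k < n; ++k) {
          acc += x[k] * (a_rotated == 0 ? a[i + a_ld*k] : a[k + a_ld*i]);
        }
        y[i] = alpha*acc + beta*y[i];  // the same update the kernels apply through AXPBY
      }
    }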
diff --git a/src/kernels/xgemv.opencl b/src/kernels/xgemv.opencl index 26e7587f..b1b2fc69 100644 --- a/src/kernels/xgemv.opencl +++ b/src/kernels/xgemv.opencl @@ -19,42 +19,63 @@ R"( // Parameters set by the tuner or by the database. Here they are given a basic default value in case // this kernel file is used outside of the CLBlast library. -#ifndef WGS - #define WGS 64 // The local work-group size + +// 1: For the full version of the kernel +#ifndef WGS1 + #define WGS1 64 // The local work-group size +#endif +#ifndef WPT1 + #define WPT1 1 // The amount of work-per-thread +#endif + +// 2: For the fast version +#ifndef WGS2 + #define WGS2 64 // The local work-group size +#endif +#ifndef WPT2 + #define WPT2 1 // The amount of work-per-thread +#endif +#ifndef VW2 + #define VW2 1 // Vector width of matrix A loads #endif -#ifndef WPT - #define WPT 1 // The amount of work-per-thread + +// 3: For the fast rotated version +#ifndef WGS3 + #define WGS3 64 // The local work-group size +#endif +#ifndef WPT3 + #define WPT3 1 // The amount of work-per-thread #endif -#ifndef VW - #define VW 1 // Vector width of matrix A loads (only for the fast kernel) +#ifndef VW3 + #define VW3 1 // Vector width of matrix A loads #endif // ================================================================================================= // Full version of the kernel -__attribute__((reqd_work_group_size(WGS, 1, 1))) +__attribute__((reqd_work_group_size(WGS1, 1, 1))) __kernel void Xgemv(const int m, const int n, const real alpha, const real beta, - const int a_transposed, + const int a_rotated, const __global real* restrict agm, const int a_offset, const int a_ld, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* ygm, const int y_offset, const int y_inc) { // Local memory for the vector X - __local real xlm[WGS]; + __local real xlm[WGS1]; // Initializes the accumulation register - real acc[WPT]; + real acc[WPT1]; #pragma unroll - for (int w=0; w<WPT; ++w) { + for (int w=0; w<WPT1; ++w) { SetToZero(acc[w]); } // Divides the work in a main and tail section - const int n_tail = n % WGS; + const int n_tail = n % WGS1; const int n_floor = n - n_tail; // Loops over work-group sized portions of the work - for (int kwg=0; kwg<n_floor; kwg+=WGS) { + for (int kwg=0; kwg<n_floor; kwg+=WGS1) { // Loads the vector X into local memory const int lid = get_local_id(0); @@ -65,21 +86,21 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta, // Loops over the work per thread, and checks whether in bounds #pragma unroll - for (int w=0; w<WPT; ++w) { + for (int w=0; w<WPT1; ++w) { const int gid = w*get_global_size(0) + get_global_id(0); if (gid < m) { - // The multiply-add function for the main part (divisable by WGS) - if (a_transposed == 0) { // Not transposed + // The multiply-add function for the main part (divisable by WGS1) + if (a_rotated == 0) { // Not rotated #pragma unroll - for (int kl=0; kl<WGS; ++kl) { + for (int kl=0; kl<WGS1; ++kl) { const int k = kwg + kl; MultiplyAdd(acc[w], xlm[kl], agm[gid + a_ld*k + a_offset]); } } else { // Transposed #pragma unroll - for (int kl=0; kl<WGS; ++kl) { + for (int kl=0; kl<WGS1; ++kl) { const int k = kwg + kl; MultiplyAdd(acc[w], xlm[kl], agm[k + a_ld*gid + a_offset]); } @@ -93,12 +114,12 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta, // Loops over the work per thread, and checks whether in bounds #pragma unroll - for (int w=0; w<WPT; ++w) { + for (int w=0; w<WPT1; ++w) { const int gid = 
w*get_global_size(0) + get_global_id(0); if (gid < m) { - // The multiply-add function for the remainder part (not divisable by WGS) - if (a_transposed == 0) { // Not transposed + // The multiply-add function for the remainder part (not divisable by WGS1) + if (a_rotated == 0) { // Not rotated #pragma unroll for (int k=n_floor; k<n; ++k) { MultiplyAdd(acc[w], xgm[k*x_inc + x_offset], agm[gid + a_ld*k + a_offset]); @@ -120,40 +141,41 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta, // ================================================================================================= // Data-widths for the 'fast' kernel -#if VW == 1 - typedef real realV; -#elif VW == 2 - typedef real2 realV; -#elif VW == 4 - typedef real4 realV; -#elif VW == 8 - typedef real8 realV; -#elif VW == 16 - typedef real16 realV; +#if VW2 == 1 + typedef real realVF; +#elif VW2 == 2 + typedef real2 realVF; +#elif VW2 == 4 + typedef real4 realVF; +#elif VW2 == 8 + typedef real8 realVF; +#elif VW2 == 16 + typedef real16 realVF; #endif // Faster version of the kernel, assuming that: -// --> 'm' and 'n' are multiples of WGS +// --> 'm' and 'n' are multiples of WGS2 // --> 'a_offset' is 0 -// --> 'a_ld' is a multiple of VW -__attribute__((reqd_work_group_size(WGS, 1, 1))) +// --> 'a_ld' is a multiple of VW2 +// --> 'a_rotated' is 0 +__attribute__((reqd_work_group_size(WGS2, 1, 1))) __kernel void XgemvFast(const int m, const int n, const real alpha, const real beta, - const int a_transposed, - const __global realV* restrict agm, const int a_offset, const int a_ld, + const int a_rotated, + const __global realVF* restrict agm, const int a_offset, const int a_ld, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* ygm, const int y_offset, const int y_inc) { // Local memory for the vector X - __local real xlm[WGS]; + __local real xlm[WGS2]; // Initializes the accumulation register - real acc[WPT]; + real acc[WPT2]; #pragma unroll - for (int w=0; w<WPT; ++w) { + for (int w=0; w<WPT2; ++w) { SetToZero(acc[w]); } // Loops over work-group sized portions of the work - for (int kwg=0; kwg<n; kwg+=WGS) { + for (int kwg=0; kwg<n; kwg+=WGS2) { // Loads the vector X into local memory const int lid = get_local_id(0); @@ -162,102 +184,156 @@ __kernel void XgemvFast(const int m, const int n, const real alpha, const real b // Synchronizes all threads in a workgroup barrier(CLK_LOCAL_MEM_FENCE); - // The multiply-add function (not transposed) - if (a_transposed == 0) { + // The multiply-add function (not rotated) + #pragma unroll + for (int kl=0; kl<WGS2; ++kl) { + const int k = kwg + kl; #pragma unroll - for (int kl=0; kl<WGS; ++kl) { - const int k = kwg + kl; - #pragma unroll - for (int w=0; w<WPT/VW; ++w) { - const int gid = (WPT/VW)*get_global_id(0) + w; - #if VW == 1 - MultiplyAdd(acc[VW*w+0], xlm[kl], agm[gid + (a_ld/VW)*k]); - #elif VW == 2 - MultiplyAdd(acc[VW*w+0], xlm[kl], agm[gid + (a_ld/VW)*k].x); - MultiplyAdd(acc[VW*w+1], xlm[kl], agm[gid + (a_ld/VW)*k].y); - #elif VW == 4 - MultiplyAdd(acc[VW*w+0], xlm[kl], agm[gid + (a_ld/VW)*k].x); - MultiplyAdd(acc[VW*w+1], xlm[kl], agm[gid + (a_ld/VW)*k].y); - MultiplyAdd(acc[VW*w+2], xlm[kl], agm[gid + (a_ld/VW)*k].z); - MultiplyAdd(acc[VW*w+3], xlm[kl], agm[gid + (a_ld/VW)*k].w); - #elif VW == 8 - MultiplyAdd(acc[VW*w+0], xlm[kl], agm[gid + (a_ld/VW)*k].s0); - MultiplyAdd(acc[VW*w+1], xlm[kl], agm[gid + (a_ld/VW)*k].s1); - MultiplyAdd(acc[VW*w+2], xlm[kl], agm[gid + (a_ld/VW)*k].s2); - MultiplyAdd(acc[VW*w+3], xlm[kl], 
agm[gid + (a_ld/VW)*k].s3); - MultiplyAdd(acc[VW*w+4], xlm[kl], agm[gid + (a_ld/VW)*k].s4); - MultiplyAdd(acc[VW*w+5], xlm[kl], agm[gid + (a_ld/VW)*k].s5); - MultiplyAdd(acc[VW*w+6], xlm[kl], agm[gid + (a_ld/VW)*k].s6); - MultiplyAdd(acc[VW*w+7], xlm[kl], agm[gid + (a_ld/VW)*k].s7); - #elif VW == 16 - MultiplyAdd(acc[VW*w+0], xlm[kl], agm[gid + (a_ld/VW)*k].s0); - MultiplyAdd(acc[VW*w+1], xlm[kl], agm[gid + (a_ld/VW)*k].s1); - MultiplyAdd(acc[VW*w+2], xlm[kl], agm[gid + (a_ld/VW)*k].s2); - MultiplyAdd(acc[VW*w+3], xlm[kl], agm[gid + (a_ld/VW)*k].s3); - MultiplyAdd(acc[VW*w+4], xlm[kl], agm[gid + (a_ld/VW)*k].s4); - MultiplyAdd(acc[VW*w+5], xlm[kl], agm[gid + (a_ld/VW)*k].s5); - MultiplyAdd(acc[VW*w+6], xlm[kl], agm[gid + (a_ld/VW)*k].s6); - MultiplyAdd(acc[VW*w+7], xlm[kl], agm[gid + (a_ld/VW)*k].s7); - MultiplyAdd(acc[VW*w+8], xlm[kl], agm[gid + (a_ld/VW)*k].s8); - MultiplyAdd(acc[VW*w+9], xlm[kl], agm[gid + (a_ld/VW)*k].s9); - MultiplyAdd(acc[VW*w+10], xlm[kl], agm[gid + (a_ld/VW)*k].sA); - MultiplyAdd(acc[VW*w+11], xlm[kl], agm[gid + (a_ld/VW)*k].sB); - MultiplyAdd(acc[VW*w+12], xlm[kl], agm[gid + (a_ld/VW)*k].sC); - MultiplyAdd(acc[VW*w+13], xlm[kl], agm[gid + (a_ld/VW)*k].sD); - MultiplyAdd(acc[VW*w+14], xlm[kl], agm[gid + (a_ld/VW)*k].sE); - MultiplyAdd(acc[VW*w+15], xlm[kl], agm[gid + (a_ld/VW)*k].sF); - #endif - } + for (int w=0; w<WPT2/VW2; ++w) { + const int gid = (WPT2/VW2)*get_global_id(0) + w; + #if VW2 == 1 + MultiplyAdd(acc[VW2*w+0], xlm[kl], agm[gid + (a_ld/VW2)*k]); + #elif VW2 == 2 + MultiplyAdd(acc[VW2*w+0], xlm[kl], agm[gid + (a_ld/VW2)*k].x); + MultiplyAdd(acc[VW2*w+1], xlm[kl], agm[gid + (a_ld/VW2)*k].y); + #elif VW2 == 4 + MultiplyAdd(acc[VW2*w+0], xlm[kl], agm[gid + (a_ld/VW2)*k].x); + MultiplyAdd(acc[VW2*w+1], xlm[kl], agm[gid + (a_ld/VW2)*k].y); + MultiplyAdd(acc[VW2*w+2], xlm[kl], agm[gid + (a_ld/VW2)*k].z); + MultiplyAdd(acc[VW2*w+3], xlm[kl], agm[gid + (a_ld/VW2)*k].w); + #elif VW2 == 8 + MultiplyAdd(acc[VW2*w+0], xlm[kl], agm[gid + (a_ld/VW2)*k].s0); + MultiplyAdd(acc[VW2*w+1], xlm[kl], agm[gid + (a_ld/VW2)*k].s1); + MultiplyAdd(acc[VW2*w+2], xlm[kl], agm[gid + (a_ld/VW2)*k].s2); + MultiplyAdd(acc[VW2*w+3], xlm[kl], agm[gid + (a_ld/VW2)*k].s3); + MultiplyAdd(acc[VW2*w+4], xlm[kl], agm[gid + (a_ld/VW2)*k].s4); + MultiplyAdd(acc[VW2*w+5], xlm[kl], agm[gid + (a_ld/VW2)*k].s5); + MultiplyAdd(acc[VW2*w+6], xlm[kl], agm[gid + (a_ld/VW2)*k].s6); + MultiplyAdd(acc[VW2*w+7], xlm[kl], agm[gid + (a_ld/VW2)*k].s7); + #elif VW2 == 16 + MultiplyAdd(acc[VW2*w+0], xlm[kl], agm[gid + (a_ld/VW2)*k].s0); + MultiplyAdd(acc[VW2*w+1], xlm[kl], agm[gid + (a_ld/VW2)*k].s1); + MultiplyAdd(acc[VW2*w+2], xlm[kl], agm[gid + (a_ld/VW2)*k].s2); + MultiplyAdd(acc[VW2*w+3], xlm[kl], agm[gid + (a_ld/VW2)*k].s3); + MultiplyAdd(acc[VW2*w+4], xlm[kl], agm[gid + (a_ld/VW2)*k].s4); + MultiplyAdd(acc[VW2*w+5], xlm[kl], agm[gid + (a_ld/VW2)*k].s5); + MultiplyAdd(acc[VW2*w+6], xlm[kl], agm[gid + (a_ld/VW2)*k].s6); + MultiplyAdd(acc[VW2*w+7], xlm[kl], agm[gid + (a_ld/VW2)*k].s7); + MultiplyAdd(acc[VW2*w+8], xlm[kl], agm[gid + (a_ld/VW2)*k].s8); + MultiplyAdd(acc[VW2*w+9], xlm[kl], agm[gid + (a_ld/VW2)*k].s9); + MultiplyAdd(acc[VW2*w+10], xlm[kl], agm[gid + (a_ld/VW2)*k].sA); + MultiplyAdd(acc[VW2*w+11], xlm[kl], agm[gid + (a_ld/VW2)*k].sB); + MultiplyAdd(acc[VW2*w+12], xlm[kl], agm[gid + (a_ld/VW2)*k].sC); + MultiplyAdd(acc[VW2*w+13], xlm[kl], agm[gid + (a_ld/VW2)*k].sD); + MultiplyAdd(acc[VW2*w+14], xlm[kl], agm[gid + (a_ld/VW2)*k].sE); + MultiplyAdd(acc[VW2*w+15], xlm[kl], agm[gid + 
(a_ld/VW2)*k].sF); + #endif } } - // The multiply-add function (transposed) - else { + // Synchronizes all threads in a workgroup + barrier(CLK_LOCAL_MEM_FENCE); + } + + // Stores the final result + #pragma unroll + for (int w=0; w<WPT2; ++w) { + const int gid = WPT2*get_global_id(0) + w; + AXPBY(ygm[gid*y_inc + y_offset], alpha, acc[w], beta, ygm[gid*y_inc + y_offset]); + } +} + +// ================================================================================================= + +// Data-widths for the 'fast' kernel with rotated matrix +#if VW3 == 1 + typedef real realVFR; +#elif VW3 == 2 + typedef real2 realVFR; +#elif VW3 == 4 + typedef real4 realVFR; +#elif VW3 == 8 + typedef real8 realVFR; +#elif VW3 == 16 + typedef real16 realVFR; +#endif + +// Faster version of the kernel, assuming that: +// --> 'm' and 'n' are multiples of WGS3 +// --> 'a_offset' is 0 +// --> 'a_ld' is a multiple of VW3 +// --> 'a_rotated' is 1 +__attribute__((reqd_work_group_size(WGS3, 1, 1))) +__kernel void XgemvFastRot(const int m, const int n, const real alpha, const real beta, + const int a_rotated, + const __global realVFR* restrict agm, const int a_offset, const int a_ld, + const __global real* restrict xgm, const int x_offset, const int x_inc, + __global real* ygm, const int y_offset, const int y_inc) { + // Local memory for the vector X + __local real xlm[WGS3]; + + // Initializes the accumulation register + real acc[WPT3]; + #pragma unroll + for (int w=0; w<WPT3; ++w) { + SetToZero(acc[w]); + } + + // Loops over work-group sized portions of the work + for (int kwg=0; kwg<n; kwg+=WGS3) { + + // Loads the vector X into local memory + const int lid = get_local_id(0); + xlm[lid] = xgm[(kwg + lid)*x_inc + x_offset]; + + // Synchronizes all threads in a workgroup + barrier(CLK_LOCAL_MEM_FENCE); + + // The multiply-add function (rotated) + #pragma unroll + for (int kl=0; kl<WGS3/VW3; ++kl) { + const int k = (kwg/VW3) + kl; #pragma unroll - for (int kl=0; kl<WGS/VW; ++kl) { - const int k = (kwg/VW) + kl; - #pragma unroll - for (int w=0; w<WPT; ++w) { - const int gid = WPT*get_global_id(0) + w; - realV avec = agm[k + (a_ld/VW)*gid]; - #if VW == 1 - MultiplyAdd(acc[w], xlm[VW*kl+0], avec); - #elif VW == 2 - MultiplyAdd(acc[w], xlm[VW*kl+0], avec.x); - MultiplyAdd(acc[w], xlm[VW*kl+1], avec.y); - #elif VW == 4 - MultiplyAdd(acc[w], xlm[VW*kl+0], avec.x); - MultiplyAdd(acc[w], xlm[VW*kl+1], avec.y); - MultiplyAdd(acc[w], xlm[VW*kl+2], avec.z); - MultiplyAdd(acc[w], xlm[VW*kl+3], avec.w); - #elif VW == 8 - MultiplyAdd(acc[w], xlm[VW*kl+0], avec.s0); - MultiplyAdd(acc[w], xlm[VW*kl+1], avec.s1); - MultiplyAdd(acc[w], xlm[VW*kl+2], avec.s2); - MultiplyAdd(acc[w], xlm[VW*kl+3], avec.s3); - MultiplyAdd(acc[w], xlm[VW*kl+4], avec.s4); - MultiplyAdd(acc[w], xlm[VW*kl+5], avec.s5); - MultiplyAdd(acc[w], xlm[VW*kl+6], avec.s6); - MultiplyAdd(acc[w], xlm[VW*kl+7], avec.s7); - #elif VW == 16 - MultiplyAdd(acc[w], xlm[VW*kl+0], avec.s0); - MultiplyAdd(acc[w], xlm[VW*kl+1], avec.s1); - MultiplyAdd(acc[w], xlm[VW*kl+2], avec.s2); - MultiplyAdd(acc[w], xlm[VW*kl+3], avec.s3); - MultiplyAdd(acc[w], xlm[VW*kl+4], avec.s4); - MultiplyAdd(acc[w], xlm[VW*kl+5], avec.s5); - MultiplyAdd(acc[w], xlm[VW*kl+6], avec.s6); - MultiplyAdd(acc[w], xlm[VW*kl+7], avec.s7); - MultiplyAdd(acc[w], xlm[VW*kl+8], avec.s8); - MultiplyAdd(acc[w], xlm[VW*kl+9], avec.s9); - MultiplyAdd(acc[w], xlm[VW*kl+10], avec.sA); - MultiplyAdd(acc[w], xlm[VW*kl+11], avec.sB); - MultiplyAdd(acc[w], xlm[VW*kl+12], avec.sC); - MultiplyAdd(acc[w], xlm[VW*kl+13], 
avec.sD); - MultiplyAdd(acc[w], xlm[VW*kl+14], avec.sE); - MultiplyAdd(acc[w], xlm[VW*kl+15], avec.sF); - #endif - } + for (int w=0; w<WPT3; ++w) { + const int gid = WPT3*get_global_id(0) + w; + realVFR avec = agm[k + (a_ld/VW3)*gid]; + #if VW3 == 1 + MultiplyAdd(acc[w], xlm[VW3*kl+0], avec); + #elif VW3 == 2 + MultiplyAdd(acc[w], xlm[VW3*kl+0], avec.x); + MultiplyAdd(acc[w], xlm[VW3*kl+1], avec.y); + #elif VW3 == 4 + MultiplyAdd(acc[w], xlm[VW3*kl+0], avec.x); + MultiplyAdd(acc[w], xlm[VW3*kl+1], avec.y); + MultiplyAdd(acc[w], xlm[VW3*kl+2], avec.z); + MultiplyAdd(acc[w], xlm[VW3*kl+3], avec.w); + #elif VW3 == 8 + MultiplyAdd(acc[w], xlm[VW3*kl+0], avec.s0); + MultiplyAdd(acc[w], xlm[VW3*kl+1], avec.s1); + MultiplyAdd(acc[w], xlm[VW3*kl+2], avec.s2); + MultiplyAdd(acc[w], xlm[VW3*kl+3], avec.s3); + MultiplyAdd(acc[w], xlm[VW3*kl+4], avec.s4); + MultiplyAdd(acc[w], xlm[VW3*kl+5], avec.s5); + MultiplyAdd(acc[w], xlm[VW3*kl+6], avec.s6); + MultiplyAdd(acc[w], xlm[VW3*kl+7], avec.s7); + #elif VW3 == 16 + MultiplyAdd(acc[w], xlm[VW3*kl+0], avec.s0); + MultiplyAdd(acc[w], xlm[VW3*kl+1], avec.s1); + MultiplyAdd(acc[w], xlm[VW3*kl+2], avec.s2); + MultiplyAdd(acc[w], xlm[VW3*kl+3], avec.s3); + MultiplyAdd(acc[w], xlm[VW3*kl+4], avec.s4); + MultiplyAdd(acc[w], xlm[VW3*kl+5], avec.s5); + MultiplyAdd(acc[w], xlm[VW3*kl+6], avec.s6); + MultiplyAdd(acc[w], xlm[VW3*kl+7], avec.s7); + MultiplyAdd(acc[w], xlm[VW3*kl+8], avec.s8); + MultiplyAdd(acc[w], xlm[VW3*kl+9], avec.s9); + MultiplyAdd(acc[w], xlm[VW3*kl+10], avec.sA); + MultiplyAdd(acc[w], xlm[VW3*kl+11], avec.sB); + MultiplyAdd(acc[w], xlm[VW3*kl+12], avec.sC); + MultiplyAdd(acc[w], xlm[VW3*kl+13], avec.sD); + MultiplyAdd(acc[w], xlm[VW3*kl+14], avec.sE); + MultiplyAdd(acc[w], xlm[VW3*kl+15], avec.sF); + #endif } } @@ -267,8 +343,8 @@ __kernel void XgemvFast(const int m, const int n, const real alpha, const real b // Stores the final result #pragma unroll - for (int w=0; w<WPT; ++w) { - const int gid = WPT*get_global_id(0) + w; + for (int w=0; w<WPT3; ++w) { + const int gid = WPT3*get_global_id(0) + w; AXPBY(ygm[gid*y_inc + y_offset], alpha, acc[w], beta, ygm[gid*y_inc + y_offset]); } } diff --git a/src/routines/xgemv.cc b/src/routines/xgemv.cc index 74851ec9..9f3908f8 100644 --- a/src/routines/xgemv.cc +++ b/src/routines/xgemv.cc @@ -70,13 +70,30 @@ StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose, if (ErrorIn(status)) { return status; } // Determines whether or not the fast-version can be used - bool use_fast_kernel = (a_offset == 0) && - IsMultiple(m, db_["WGS"]*db_["WPT"]) && - IsMultiple(n, db_["WGS"]) && - IsMultiple(a_ld, db_["VW"]); - - // If possible, run the fast-version of the kernel - auto kernel_name = (use_fast_kernel) ? 
"XgemvFast" : "Xgemv"; + bool use_fast_kernel = (a_offset == 0) && (a_rotated == 0) && + IsMultiple(m, db_["WGS2"]*db_["WPT2"]) && + IsMultiple(n, db_["WGS2"]) && + IsMultiple(a_ld, db_["VW2"]); + bool use_fast_kernel_rot = (a_offset == 0) && (a_rotated == 1) && + IsMultiple(m, db_["WGS3"]*db_["WPT3"]) && + IsMultiple(n, db_["WGS3"]) && + IsMultiple(a_ld, db_["VW3"]); + + // If possible, run the fast-version (rotated or non-rotated) of the kernel + auto kernel_name = "Xgemv"; + auto m_ceiled = Ceil(m_real, db_["WGS1"]*db_["WPT1"]); + auto global_size = m_ceiled / db_["WPT1"]; + auto local_size = db_["WGS1"]; + if (use_fast_kernel) { + kernel_name = "XgemvFast"; + global_size = m_real / db_["WPT2"]; + local_size = db_["WGS2"]; + } + if (use_fast_kernel_rot) { + kernel_name = "XgemvFastRot"; + global_size = m_real / db_["WPT3"]; + local_size = db_["WGS3"]; + } // Retrieves the Xgemv kernel from the compiled binary try { @@ -100,9 +117,8 @@ StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose, kernel.SetArgument(13, static_cast<int>(y_inc)); // Launches the kernel - auto m_ceiled = Ceil(m_real, db_["WGS"]*db_["WPT"]); - auto global = std::vector<size_t>{m_ceiled / db_["WPT"]}; - auto local = std::vector<size_t>{db_["WGS"]}; + auto global = std::vector<size_t>{global_size}; + auto local = std::vector<size_t>{local_size}; status = RunKernel(kernel, global, local); if (ErrorIn(status)) { return status; } diff --git a/src/tuning/tuning.cc b/src/tuning/tuning.cc index d617af88..2dcb11d5 100644 --- a/src/tuning/tuning.cc +++ b/src/tuning/tuning.cc @@ -75,7 +75,8 @@ template void TunerXY<double2>(int, char**, const Tuner2<double2>&); // Function to get command-line argument, set-up the input buffers, configure the tuner, and collect // the results. Used for matrix-vector-vector routines. template <typename T> -void TunerAXY(int argc, char* argv[], const Tuner3<T> &tune_function) { +void TunerAXY(int argc, char* argv[], const size_t num_variations, + const Tuner3V<T> &tune_function) { // Sets the parameters and platform/device for which to tune (command-line options) auto help = std::string{"* Options given/available:\n"}; @@ -83,11 +84,10 @@ void TunerAXY(int argc, char* argv[], const Tuner3<T> &tune_function) { args.platform_id = GetArgument(argc, argv, help, kArgPlatform, size_t{0}); args.device_id = GetArgument(argc, argv, help, kArgDevice, size_t{0}); args.precision = GetArgument(argc, argv, help, kArgPrecision, Precision::kSingle); - args.m = GetArgument(argc, argv, help, kArgM, size_t{1024}); - args.n = GetArgument(argc, argv, help, kArgN, size_t{1024}); + args.m = GetArgument(argc, argv, help, kArgM, size_t{2048}); + args.n = GetArgument(argc, argv, help, kArgN, size_t{2048}); args.alpha = GetArgument(argc, argv, help, kArgAlpha, GetScalar<T>()); args.beta = GetArgument(argc, argv, help, kArgBeta, GetScalar<T>()); - args.layout = GetArgument(argc, argv, help, kArgLayout, Layout::kColMajor); fprintf(stdout, "%s\n", help.c_str()); // Creates input buffers with random data @@ -98,36 +98,40 @@ void TunerAXY(int argc, char* argv[], const Tuner3<T> &tune_function) { PopulateVector(x_vec); PopulateVector(y_vec); - // Initializes the tuner for the chosen device - cltune::Tuner tuner(args.platform_id, args.device_id); + // Loop over the different variations of the kernel + for (auto variation=size_t{1}; variation<=num_variations; ++variation) { - // Use full-search to explore all parameter combinations. 
- tuner.UseFullSearch(); + // Initializes the tuner for the chosen device + cltune::Tuner tuner(args.platform_id, args.device_id); - // Configures the tuning parameters (kernel specific) - tune_function(args, a_mat, x_vec, y_vec, tuner); + // Use full-search to explore all parameter combinations. + tuner.UseFullSearch(); - // Starts the tuning process - tuner.Tune(); + // Configures the tuning parameters (kernel specific) + tune_function(args, variation, a_mat, x_vec, y_vec, tuner); - // Prints the results to screen - auto time_ms = tuner.PrintToScreen(); - tuner.PrintFormatted(); + // Starts the tuning process + tuner.Tune(); - // Also prints the performance of the best-case in terms of GB/s and GFLOPS - const auto mega_bytes = ((args.m*args.n + 2*args.m + args.n)*GetBytes(args.precision)) * 1.0e-6; - const auto mega_flops = (2*args.m*args.n) * 1.0e-6; - if (time_ms != 0.0) { - printf("[ -------> ] %.1lf ms or %.1lf GB/s or %.1lf GFLOPS\n", - time_ms, mega_bytes/time_ms, mega_flops/time_ms); + // Prints the results to screen + auto time_ms = tuner.PrintToScreen(); + tuner.PrintFormatted(); + + // Also prints the performance of the best-case in terms of GB/s and GFLOPS + const auto mega_bytes = ((args.m*args.n + 2*args.m + args.n)*GetBytes(args.precision)) * 1.0e-6; + const auto mega_flops = (2*args.m*args.n) * 1.0e-6; + if (time_ms != 0.0) { + printf("[ -------> ] %.1lf ms or %.1lf GB/s or %.1lf GFLOPS\n", + time_ms, mega_bytes/time_ms, mega_flops/time_ms); + } } } // Compiles the above function -template void TunerAXY<float>(int, char**, const Tuner3<float>&); -template void TunerAXY<double>(int, char**, const Tuner3<double>&); -template void TunerAXY<float2>(int, char**, const Tuner3<float2>&); -template void TunerAXY<double2>(int, char**, const Tuner3<double2>&); +template void TunerAXY<float>(int, char**, const size_t, const Tuner3V<float>&); +template void TunerAXY<double>(int, char**, const size_t, const Tuner3V<double>&); +template void TunerAXY<float2>(int, char**, const size_t, const Tuner3V<float2>&); +template void TunerAXY<double2>(int, char**, const size_t, const Tuner3V<double2>&); // ================================================================================================= diff --git a/src/tuning/xgemv.cc b/src/tuning/xgemv.cc index e2d54729..dccd250c 100644 --- a/src/tuning/xgemv.cc +++ b/src/tuning/xgemv.cc @@ -8,6 +8,10 @@ // Cedric Nugteren <www.cedricnugteren.nl> // // This file implements an auto-tuner to tune the Xgemv OpenCL kernel. It uses the CLTune library. +// Three variations of the kernel are tuned: +// 1: The full version of the kernel +// 2: The fast version for non-transposed matrices +// 3: The fast version for transposed matrices // // ================================================================================================= @@ -23,43 +27,60 @@ namespace clblast { // The Xgemv auto-tuner template <typename T> -void XgemvTune(const Arguments<T> &args, +void XgemvTune(const Arguments<T> &args, const size_t variation, const std::vector<T> &a_mat, const std::vector<T> &x_vec, std::vector<T> &y_vec, cltune::Tuner &tuner) { + // Sets the kernel name and the layout argument + auto kernel_name = (variation == 1) ? "Xgemv" : ((variation == 2) ? "XgemvFast" : "XgemvFastRot"); + auto a_rotated = (variation == 3) ? 
1 : 0; + // This points to the Xgemv kernel as found in the CLBlast library std::string common_source = #include "../src/kernels/common.opencl" std::string kernel_source = #include "../src/kernels/xgemv.opencl" auto sources = common_source + kernel_source; - auto id = tuner.AddKernelFromString(sources, "XgemvFast", {args.m}, {1}); + auto id = tuner.AddKernelFromString(sources, kernel_name, {args.m}, {1}); tuner.SetReferenceFromString(sources, "Xgemv", {args.m}, {64}); - // Sets the tunable parameters and their possible values - tuner.AddParameter(id, "WGS", {64, 128, 256, 512, 1024, 1536, 2048}); - tuner.AddParameter(id, "WPT", {1, 2, 4, 8}); - tuner.AddParameter(id, "VW", {1, 2, 4, 8}); + // Helper for the constraints + auto MultipleOfX = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]); }; + + // Sets the tunable parameters, their possible values, the adjusted thread sizes, and constraints + if (variation == 1) { + tuner.AddParameter(id, "WGS1", {64, 128, 256, 512, 1024, 1536, 2048}); + tuner.AddParameter(id, "WPT1", {1, 2, 4, 8}); + tuner.MulLocalSize(id, {"WGS1"}); + tuner.DivGlobalSize(id, {"WPT1"}); + } + else if (variation == 2) { + tuner.AddParameter(id, "WGS2", {64, 128, 256, 512, 1024, 1536, 2048}); + tuner.AddParameter(id, "WPT2", {1, 2, 4, 8}); + tuner.AddParameter(id, "VW2", {1, 2, 4, 8}); + tuner.MulLocalSize(id, {"WGS2"}); + tuner.DivGlobalSize(id, {"WPT2"}); + tuner.AddConstraint(id, MultipleOfX, {"WPT2", "VW2"}); + } + else if (variation == 3) { + tuner.AddParameter(id, "WGS3", {64, 128, 256, 512, 1024, 1536, 2048}); + tuner.AddParameter(id, "WPT3", {1, 2, 4, 8}); + tuner.AddParameter(id, "VW3", {1, 2, 4, 8}); + tuner.MulLocalSize(id, {"WGS3"}); + tuner.DivGlobalSize(id, {"WPT3"}); + tuner.AddConstraint(id, MultipleOfX, {"WGS3", "VW3"}); + } // Tests for a specific precision tuner.AddParameter(id, "PRECISION", {static_cast<size_t>(args.precision)}); tuner.AddParameterReference("PRECISION", static_cast<size_t>(args.precision)); - // Sets the constraints - auto MultipleOfX = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]); }; - tuner.AddConstraint(id, MultipleOfX, {"WGS", "VW"}); - tuner.AddConstraint(id, MultipleOfX, {"WPT", "VW"}); - - // Modifies the thread-sizes (local) based on the parameters - tuner.MulLocalSize(id, {"WGS"}); - tuner.DivGlobalSize(id, {"WPT"}); - // Sets the function's arguments tuner.AddArgumentScalar(static_cast<int>(args.m)); tuner.AddArgumentScalar(static_cast<int>(args.n)); tuner.AddArgumentScalar(args.alpha); tuner.AddArgumentScalar(args.beta); - tuner.AddArgumentScalar(static_cast<int>(args.layout)); + tuner.AddArgumentScalar(static_cast<int>(a_rotated)); tuner.AddArgumentInput(a_mat); tuner.AddArgumentScalar(0); tuner.AddArgumentScalar(static_cast<int>(args.m)); @@ -75,12 +96,13 @@ void XgemvTune(const Arguments<T> &args, // Main function which calls the common client code with the routine-specific function as argument. 
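For reference, the GB/s and GFLOPS figures printed per variation by the tuner loop above come from a simple GEMV cost model: m*n + 2*m + n elements moved and 2*m*n flops. A standalone sketch with hypothetical numbers (single precision, the new 2048x2048 default size, and an invented best-case time):

    #include <cstdio>

    int main() {
      const double m = 2048.0, n = 2048.0;   // default tuning size after this change
      const double bytes   = 4.0;            // assuming single precision
      const double time_ms = 1.2;            // hypothetical best-case tuning result

      const double mega_bytes = (m*n + 2.0*m + n) * bytes * 1.0e-6;
      const double mega_flops = 2.0*m*n * 1.0e-6;
      std::printf("[ -------> ] %.1lf ms or %.1lf GB/s or %.1lf GFLOPS\n",
                  time_ms, mega_bytes/time_ms, mega_flops/time_ms);
      return 0;
    }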
void TunerXgemv(int argc, char *argv[]) { + auto num_variations = size_t{3}; switch(GetPrecision(argc, argv)) { case Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); - case Precision::kSingle: TunerAXY<float>(argc, argv, XgemvTune<float>); break; - case Precision::kDouble: TunerAXY<double>(argc, argv, XgemvTune<double>); break; - case Precision::kComplexSingle: TunerAXY<float2>(argc, argv, XgemvTune<float2>); break; - case Precision::kComplexDouble: TunerAXY<double2>(argc, argv, XgemvTune<double2>); break; + case Precision::kSingle: TunerAXY<float>(argc, argv, num_variations, XgemvTune<float>); break; + case Precision::kDouble: TunerAXY<double>(argc, argv, num_variations, XgemvTune<double>); break; + case Precision::kComplexSingle: TunerAXY<float2>(argc, argv, num_variations, XgemvTune<float2>); break; + case Precision::kComplexDouble: TunerAXY<double2>(argc, argv, num_variations, XgemvTune<double2>); break; } }
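The different tuning constraints of the two fast variants (WPT2 a multiple of VW2, but WGS3 a multiple of VW3) follow from where the vector loads of A point. A scalar illustration of the per-thread index arithmetic, with hypothetical parameter values; it only prints the index mapping and is not CLBlast code:

    #include <cstdio>

    int main() {
      const int WPT2 = 4, VW2 = 2;           // fast, non-rotated variant
      const int WPT3 = 2, VW3 = 4;           // fast, rotated variant
      const int tid = 3, a_ld = 64, k = 5;   // hypothetical thread id, leading dim, k-index

      // XgemvFast: one vector load of width VW2 from a column of A feeds VW2 different
      // output rows, so each thread's WPT2 rows are covered by WPT2/VW2 loads
      // (hence the constraint that WPT2 is a multiple of VW2).
      for (int w = 0; w < WPT2/VW2; ++w) {
        for (int v = 0; v < VW2; ++v) {
          const int row = WPT2*tid + VW2*w + v;
          std::printf("XgemvFast:    acc[%d] += x[%d] * A[%d]\n", VW2*w + v, k, row + a_ld*k);
        }
      }

      // XgemvFastRot: the vector load of width VW3 runs along the dot-product dimension
      // of a single row (here k counts vectors of width VW3) and feeds one accumulator
      // with VW3 consecutive x values; the x tile cached in local memory holds WGS3
      // elements, hence WGS3 must be a multiple of VW3.
      for (int w = 0; w < WPT3; ++w) {
        const int row = WPT3*tid + w;
        for (int v = 0; v < VW3; ++v) {
          std::printf("XgemvFastRot: acc[%d] += x[%d] * A[%d]\n", w, VW3*k + v, (VW3*k + v) + a_ld*row);
        }
      }
      return 0;
    }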